From 71cfa45e76415343e9d83b483f62d1c44cb821cc Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 28 Nov 2024 19:31:04 -0800 Subject: [PATCH 001/264] wip: initial work on mimalloc3 without segments --- ide/vs2022/mimalloc.vcxproj | 6 +- ide/vs2022/mimalloc.vcxproj.filters | 15 +- include/mimalloc/bits.h | 313 +++++ include/mimalloc/internal.h | 154 +-- include/mimalloc/prim.h | 3 - include/mimalloc/types.h | 61 +- src/bitmap.c | 13 +- src/init.c | 22 +- src/libc.c | 67 +- src/os.c | 12 + src/page-queue.c | 7 +- src/xarena.c | 1777 +++++++++++++++++++++++++++ src/xbitmap.c | 599 +++++++++ src/xbitmap.h | 94 ++ test/main-override-static.c | 147 ++- 15 files changed, 3001 insertions(+), 289 deletions(-) create mode 100644 include/mimalloc/bits.h create mode 100644 src/xarena.c create mode 100644 src/xbitmap.c create mode 100644 src/xbitmap.h diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index dddab777..138acf39 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -120,6 +120,7 @@ CompileAsCpp false stdcpp20 + AdvancedVectorExtensions2 @@ -219,7 +220,6 @@ true true - false @@ -252,17 +252,21 @@ + + + + diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 54ee0fcb..48958be1 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -13,9 +13,6 @@ Sources - - Sources - Sources @@ -64,6 +61,12 @@ Sources + + Sources + + + Sources + @@ -93,6 +96,12 @@ Headers + + Headers + + + Headers + diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h new file mode 100644 index 00000000..642f0f9c --- /dev/null +++ b/include/mimalloc/bits.h @@ -0,0 +1,313 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + Bit operation, and platform dependent definition (MI_INTPTR_SIZE etc) +---------------------------------------------------------------------------- */ + +#pragma once +#ifndef MI_BITS_H +#define MI_BITS_H + + +// ------------------------------------------------------ +// Size of a pointer. +// We assume that `sizeof(void*)==sizeof(intptr_t)` +// and it holds for all platforms we know of. +// +// However, the C standard only requires that: +// p == (void*)((intptr_t)p)) +// but we also need: +// i == (intptr_t)((void*)i) +// or otherwise one might define an intptr_t type that is larger than a pointer... 
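
As an aside, the round-trip assumption spelled out above can be pinned down with a C11 static assertion; a minimal sketch (not part of this patch):

    #include <stdint.h>

    // The code assumes a pointer and an intptr_t have the same size, so that
    // both p == (void*)((intptr_t)p) and i == (intptr_t)((void*)i) can hold.
    _Static_assert(sizeof(void*) == sizeof(intptr_t),
                   "mimalloc assumes sizeof(void*) == sizeof(intptr_t)");
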
+// ------------------------------------------------------ + +#if INTPTR_MAX > INT64_MAX +# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) +#elif INTPTR_MAX == INT64_MAX +# define MI_INTPTR_SHIFT (3) +#elif INTPTR_MAX == INT32_MAX +# define MI_INTPTR_SHIFT (2) +#else +#error platform pointers must be 32, 64, or 128 bits +#endif + +#if SIZE_MAX == UINT64_MAX +# define MI_SIZE_SHIFT (3) +typedef int64_t mi_ssize_t; +#elif SIZE_MAX == UINT32_MAX +# define MI_SIZE_SHIFT (2) +typedef int32_t mi_ssize_t; +#else +#error platform objects must be 32 or 64 bits +#endif + +#if (SIZE_MAX/2) > LONG_MAX +# define MI_ZU(x) x##ULL +# define MI_ZI(x) x##LL +#else +# define MI_ZU(x) x##UL +# define MI_ZI(x) x##L +#endif + +#define MI_INTPTR_SIZE (1< +#endif +#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) +#include +#endif + +#if defined(__AVX2__) && !defined(__BMI2__) // msvc +#define __BMI2__ 1 +#endif +#if (defined(__AVX2__) || defined(__BMI2__)) && !defined(__BMI1__) // msvc +#define __BMI1__ 1 +#endif + +/* -------------------------------------------------------------------------------- + Builtin's +-------------------------------------------------------------------------------- */ + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#define mi_builtin(name) __builtin_##name +#define mi_has_builtin(name) __has_builtin(__builtin_##name) + +#if (LONG_MAX == INT32_MAX) +#define mi_builtin32(name) mi_builtin(name##l) +#define mi_has_builtin32(name) mi_has_builtin(name##l) +#else +#define mi_builtin32(name) mi_builtin(name) +#define mi_has_builtin32(name) mi_has_builtin(name) +#endif +#if (LONG_MAX == INT64_MAX) +#define mi_builtin64(name) mi_builtin(name##l) +#define mi_has_builtin64(name) mi_has_builtin(name##l) +#else +#define mi_builtin64(name) mi_builtin(name##ll) +#define mi_has_builtin64(name) mi_has_builtin(name##ll) +#endif + +#if (MI_SIZE_BITS == 32) +#define mi_builtin_size(name) mi_builtin32(name) +#define mi_has_builtin_size(name) mi_has_builtin32(name) +#elif (MI_SIZE_BITS == 64) +#define mi_builtin_size(name) mi_builtin64(name) +#define mi_has_builtin_size(name) mi_has_builtin64(name) +#endif + + +/* -------------------------------------------------------------------------------- + Count trailing/leading zero's +-------------------------------------------------------------------------------- */ + +size_t _mi_clz_generic(size_t x); +size_t _mi_ctz_generic(size_t x); + +static inline size_t mi_ctz(size_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + uint64_t r; + __asm volatile ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + #elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_tzcnt_u64(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + #if MI_SIZE_BITS==32 + return (_BitScanForward(&idx, x) ? (size_t)idx : 32); + #else + return (_BitScanForward64(&idx, x) ? (size_t)idx : 64); + #endif + #elif mi_has_builtin_size(ctz) + return (x!=0 ? 
(size_t)mi_builtin_size(ctz)(x) : MI_SIZE_BITS); + #else + #define MI_HAS_FAST_BITSCAN 0 + return _mi_ctz_generic(x); + #endif +} + +static inline size_t mi_clz(size_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + uint64_t r; + __asm volatile ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + #elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_lzcnt_u64(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + #if MI_SIZE_BITS==32 + return (_BitScanReverse(&idx, x) ? 31 - (size_t)idx : 32); + #else + return (_BitScanReverse64(&idx, x) ? 63 - (size_t)idx : 64); + #endif + #elif mi_has_builtin_size(clz) + return (x!=0 ? (size_t)mi_builtin_size(clz)(x) : MI_SIZE_BITS); + #else + #define MI_HAS_FAST_BITSCAN 0 + return _mi_clz_generic(x); + #endif +} + +#ifndef MI_HAS_FAST_BITSCAN +#define MI_HAS_FAST_BITSCAN 1 +#endif + +/* -------------------------------------------------------------------------------- + find trailing/leading zero (bit scan forward/reverse) +-------------------------------------------------------------------------------- */ + +// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bsf(size_t x, size_t* idx) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + // on x64 the carry flag is set on zero which gives better codegen + bool is_zero; + __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); + return !is_zero; + #else + *idx = mi_ctz(x); + return (x!=0); + #endif +} + +// Bit scan reverse: find the most significant bit that is set +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bsr(size_t x, size_t* idx) { + #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + #if MI_SIZE_BITS==32 + return (_BitScanReverse(&i, x) ? (*idx = i, true) : false); + #else + return (_BitScanReverse64(&i, x) ? (*idx = i, true) : false); + #endif + #else + const size_t r = mi_clz(x); + *idx = (~r & (MI_SIZE_BITS - 1)); + return (x!=0); + #endif +} + + +/* -------------------------------------------------------------------------------- + find least/most significant bit position +-------------------------------------------------------------------------------- */ + +// Find most significant bit index, or MI_SIZE_BITS if 0 +static inline size_t mi_find_msb(size_t x) { + #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + #if MI_SIZE_BITS==32 + return (_BitScanReverse(&i, x) ? i : 32); + #else + return (_BitScanReverse64(&i, x) ? i : 64); + #endif + #else + return (x==0 ? 
MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x)); + #endif +} + +// Find least significant bit index, or MI_SIZE_BITS if 0 (this equals `mi_ctz`, count trailing zero's) +static inline size_t mi_find_lsb(size_t x) { + return mi_ctz(x); +} + + +/* -------------------------------------------------------------------------------- + rotate +-------------------------------------------------------------------------------- */ + +static inline size_t mi_rotr(size_t x, size_t r) { + #if (mi_has_builtin(rotateright64) && MI_SIZE_BITS==64) + return mi_builtin(rotateright64)(x,r); + #elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32) + return mi_builtin(rotateright32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if MI_BFIELD_SIZE==4 + return _lrotr(x,(int)r); + #else + return _rotr64(x,(int)r); + #endif + #else + // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); + return (x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1))); + #endif +} + +static inline size_t mi_rotl(size_t x, size_t r) { + #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64) + return mi_builtin(rotateleft64)(x,r); + #elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32) + return mi_builtin(rotateleft32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if MI_SIZE_BITS==32 + return _lrotl(x,(int)r); + #else + return _rotl64(x,(int)r); + #endif + #else + // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); + return (x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1))) + #endif +} + +#endif // MI_BITS_H diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 716386d2..b997099e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -16,6 +16,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "types.h" #include "track.h" +#include "bits.h" #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) @@ -23,26 +24,28 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_trace_message(...) 
#endif -#define MI_CACHE_LINE 64 #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) -#define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) +#define mi_decl_align(a) __declspec(align(a)) #define mi_decl_weak #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread -#define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) +#define mi_decl_align(a) __attribute__((aligned(a))) #define mi_decl_weak __attribute__((weak)) #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) -#define mi_decl_cache_align +#define mi_decl_align(a) #define mi_decl_weak #endif +#define mi_decl_cache_align mi_decl_align(64) + + #if defined(__EMSCRIPTEN__) && !defined(__wasi__) #define __wasi__ #endif @@ -89,6 +92,7 @@ void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; +size_t _mi_thread_seq_id(void) mi_attr_noexcept; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); void _mi_heap_guarded_init(mi_heap_t* heap); @@ -96,6 +100,7 @@ void _mi_heap_guarded_init(mi_heap_t* heap); // os.c void _mi_os_init(void); // called from process init void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); @@ -675,15 +680,6 @@ static inline bool mi_is_in_same_page(const void* p, const void* q) { return (idxp == idxq); } -static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) { - shift %= MI_INTPTR_BITS; - return (shift==0 ? x : ((x << shift) | (x >> (MI_INTPTR_BITS - shift)))); -} -static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) { - shift %= MI_INTPTR_BITS; - return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift)))); -} - static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]); return (p==null ? 
NULL : p); @@ -821,112 +817,6 @@ static inline size_t _mi_os_numa_node_count(void) { } - -// ----------------------------------------------------------------------- -// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero) -// ----------------------------------------------------------------------- - -#if defined(__GNUC__) - -#include // LONG_MAX -#define MI_HAVE_FAST_BITSCAN -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (INTPTR_MAX == LONG_MAX) - return __builtin_clzl(x); -#else - return __builtin_clzll(x); -#endif -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (INTPTR_MAX == LONG_MAX) - return __builtin_ctzl(x); -#else - return __builtin_ctzll(x); -#endif -} - -#elif defined(_MSC_VER) - -#include // LONG_MAX -#include // BitScanReverse64 -#define MI_HAVE_FAST_BITSCAN -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; - unsigned long idx; -#if (INTPTR_MAX == LONG_MAX) - _BitScanReverse(&idx, x); -#else - _BitScanReverse64(&idx, x); -#endif - return ((MI_INTPTR_BITS - 1) - idx); -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; - unsigned long idx; -#if (INTPTR_MAX == LONG_MAX) - _BitScanForward(&idx, x); -#else - _BitScanForward64(&idx, x); -#endif - return idx; -} - -#else -static inline size_t mi_ctz32(uint32_t x) { - // de Bruijn multiplication, see - static const unsigned char debruijn[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, - 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 - }; - if (x==0) return 32; - return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; -} -static inline size_t mi_clz32(uint32_t x) { - // de Bruijn multiplication, see - static const uint8_t debruijn[32] = { - 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, - 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 - }; - if (x==0) return 32; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; -} - -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_clz32((uint32_t)x); -#else - size_t count = mi_clz32((uint32_t)(x >> 32)); - if (count < 32) return count; - return (32 + mi_clz32((uint32_t)x)); -#endif -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_ctz32((uint32_t)x); -#else - size_t count = mi_ctz32((uint32_t)x); - if (count < 32) return count; - return (32 + mi_ctz32((uint32_t)(x>>32))); -#endif -} - -#endif - -// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero) -static inline size_t mi_bsr(uintptr_t x) { - return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x)); -} - - // --------------------------------------------------------------------------------- // Provide our own `_mi_memcpy` for potential performance optimizations. 
// @@ -947,20 +837,20 @@ static inline void _mi_memcpy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } } -static inline void _mi_memzero(void* dst, size_t n) { +static inline void _mi_memset(void* dst, int val, size_t n) { if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { - __stosb((unsigned char*)dst, 0, n); + __stosb((unsigned char*)dst, (uint8_t)val, n); } else { - memset(dst, 0, n); + memset(dst, val, n); } } #else static inline void _mi_memcpy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } -static inline void _mi_memzero(void* dst, size_t n) { - memset(dst, 0, n); +static inline void _mi_memset(void* dst, int val, size_t n) { + memset(dst, val, n); } #endif @@ -978,10 +868,10 @@ static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { _mi_memcpy(adst, asrc, n); } -static inline void _mi_memzero_aligned(void* dst, size_t n) { +static inline void _mi_memset_aligned(void* dst, int val, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); - _mi_memzero(adst, n); + _mi_memset(adst, val, n); } #else // Default fallback on `_mi_memcpy` @@ -990,11 +880,19 @@ static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { _mi_memcpy(dst, src, n); } -static inline void _mi_memzero_aligned(void* dst, size_t n) { +static inline void _mi_memset_aligned(void* dst, int val, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); - _mi_memzero(dst, n); + _mi_memset(dst, val, n); } #endif +static inline void _mi_memzero(void* dst, size_t n) { + _mi_memset(dst, 0, n); +} + +static inline void _mi_memzero_aligned(void* dst, size_t n) { + _mi_memset_aligned(dst, 0, n); +} + #endif diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 56715df4..8a627438 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -369,7 +369,4 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { #endif // mi_prim_get_default_heap() - - - #endif // MIMALLOC_PRIM_H diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 44074450..e8705991 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -23,6 +23,7 @@ terms of the MIT license. A copy of the license can be found in the file #include // ptrdiff_t #include // uintptr_t, uint16_t, etc +#include "bits.h" // bit ops, size defines #include "atomic.h" // _Atomic #ifdef _MSC_VER @@ -106,61 +107,6 @@ terms of the MIT license. A copy of the license can be found in the file // #define MI_HUGE_PAGE_ABANDON 1 -// ------------------------------------------------------ -// Platform specific values -// ------------------------------------------------------ - -// ------------------------------------------------------ -// Size of a pointer. -// We assume that `sizeof(void*)==sizeof(intptr_t)` -// and it holds for all platforms we know of. -// -// However, the C standard only requires that: -// p == (void*)((intptr_t)p)) -// but we also need: -// i == (intptr_t)((void*)i) -// or otherwise one might define an intptr_t type that is larger than a pointer... 
-// ------------------------------------------------------ - -#if INTPTR_MAX > INT64_MAX -# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) -#elif INTPTR_MAX == INT64_MAX -# define MI_INTPTR_SHIFT (3) -#elif INTPTR_MAX == INT32_MAX -# define MI_INTPTR_SHIFT (2) -#else -#error platform pointers must be 32, 64, or 128 bits -#endif - -#if SIZE_MAX == UINT64_MAX -# define MI_SIZE_SHIFT (3) -typedef int64_t mi_ssize_t; -#elif SIZE_MAX == UINT32_MAX -# define MI_SIZE_SHIFT (2) -typedef int32_t mi_ssize_t; -#else -#error platform objects must be 32 or 64 bits -#endif - -#if (SIZE_MAX/2) > LONG_MAX -# define MI_ZU(x) x##ULL -# define MI_ZI(x) x##LL -#else -# define MI_ZU(x) x##UL -# define MI_ZI(x) x##L -#endif - -#define MI_INTPTR_SIZE (1<= 655360) #error "mimalloc internal: define more bins" @@ -461,8 +410,6 @@ typedef struct mi_page_queue_s { size_t block_size; } mi_page_queue_t; -#define MI_BIN_FULL (MI_BIN_HUGE+1) - // Random context typedef struct mi_random_cxt_s { uint32_t input[16]; diff --git a/src/bitmap.c b/src/bitmap.c index 976ba72c..3e6311dc 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -18,6 +18,7 @@ between the fields. (This is used in arena allocation) #include "mimalloc.h" #include "mimalloc/internal.h" +#include "mimalloc/bits.h" #include "bitmap.h" /* ----------------------------------------------------------- @@ -53,7 +54,7 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ const size_t mask = mi_bitmap_mask_(count, 0); const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; -#ifdef MI_HAVE_FAST_BITSCAN +#if MI_HAS_FAST_BITSCAN size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible #else size_t bitidx = 0; // otherwise start at 0 @@ -79,7 +80,7 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ } else { // on to the next bit range -#ifdef MI_HAVE_FAST_BITSCAN +#if MI_HAS_FAST_BITSCAN mi_assert_internal(mapm != 0); const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); mi_assert_internal(shift > 0 && shift <= count); @@ -146,7 +147,7 @@ static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size return ((field & mask) == mask); } -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. +// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. // Returns `true` if successful when all previous `count` bits were 0. 
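
For readers new to this pattern, here is a self-contained sketch of the same claim-by-CAS idea on a single bit field, written with plain C11 atomics instead of the mimalloc wrappers (illustrative names only; the patch's `_mi_bitmap_try_claim` follows below and is essentially this loop expressed with `mi_atomic_cas_strong_acq_rel` and `mi_bitmap_mask_`):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define FIELD_BITS (8*sizeof(size_t))

    // Mask of `count` consecutive 1-bits starting at `bitidx`
    // (requires 1 <= count and count + bitidx <= FIELD_BITS).
    static size_t bit_mask(size_t count, size_t bitidx) {
      return (count >= FIELD_BITS ? ~(size_t)0 : (((size_t)1 << count) - 1)) << bitidx;
    }

    // Atomically flip the masked bits from 0 to 1; fail if any of them was already set.
    static bool try_claim_bits(_Atomic(size_t)* field, size_t bitidx, size_t count) {
      const size_t mask = bit_mask(count, bitidx);
      size_t expected = atomic_load_explicit(field, memory_order_relaxed);
      do {
        if ((expected & mask) != 0) return false;  // someone else holds part of the range
      } while (!atomic_compare_exchange_weak_explicit(field, &expected, expected | mask,
                                                      memory_order_acq_rel, memory_order_relaxed));
      return true;
    }
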
bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { const size_t idx = mi_bitmap_index_field(bitmap_idx); @@ -154,9 +155,9 @@ bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count const size_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); - do { + do { if ((expected & mask) != 0) return false; - } + } while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); mi_assert_internal((expected & mask) == 0); return true; @@ -194,7 +195,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit if (initial == 0) return false; if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries - + // scan ahead size_t found = initial; size_t mask = 0; // mask bits for the final field diff --git a/src/init.c b/src/init.c index a90818a4..2544f097 100644 --- a/src/init.c +++ b/src/init.c @@ -124,6 +124,18 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } +// Thread sequence number +static _Atomic(size_t) mi_tcount; +static mi_decl_thread size_t mi_tseq; + +size_t _mi_thread_seq_id(void) mi_attr_noexcept { + size_t tseq = mi_tseq; + if (tseq == 0) { + mi_tseq = tseq = mi_atomic_add_acq_rel(&mi_tcount,1); + } + return tseq; +} + // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; @@ -169,8 +181,8 @@ mi_stats_t _mi_stats_main = { MI_STATS_NULL }; #if MI_GUARDED mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { heap->guarded_sample_seed = seed; - if (heap->guarded_sample_seed == 0) { - heap->guarded_sample_seed = _mi_heap_random_next(heap); + if (heap->guarded_sample_seed == 0) { + heap->guarded_sample_seed = _mi_heap_random_next(heap); } heap->guarded_sample_rate = sample_rate; if (heap->guarded_sample_rate >= 1) { @@ -188,9 +200,9 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { mi_heap_guarded_set_sample_rate(heap, (size_t)mi_option_get_clamp(mi_option_guarded_sample_rate, 0, LONG_MAX), (size_t)mi_option_get(mi_option_guarded_sample_seed)); - mi_heap_guarded_set_size_bound(heap, + mi_heap_guarded_set_size_bound(heap, (size_t)mi_option_get_clamp(mi_option_guarded_min, 0, LONG_MAX), - (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); + (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); } #else mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { @@ -602,7 +614,7 @@ static void mi_detect_cpu_features(void) { } #else static void mi_detect_cpu_features(void) { - // nothing + // nothing } #endif diff --git a/src/libc.c b/src/libc.c index ce541f1b..05ed7b02 100644 --- a/src/libc.c +++ b/src/libc.c @@ -7,7 +7,7 @@ terms of the MIT license. 
A copy of the license can be found in the file // -------------------------------------------------------- // This module defines various std libc functions to reduce -// the dependency on libc, and also prevent errors caused +// the dependency on libc, and also prevent errors caused // by some libc implementations when called before `main` // executes (due to malloc redirection) // -------------------------------------------------------- @@ -83,7 +83,7 @@ bool _mi_getenv(const char* name, char* result, size_t result_size) { // Define our own limited `_mi_vsnprintf` and `_mi_snprintf` // This is mostly to avoid calling these when libc is not yet // initialized (and to reduce dependencies) -// +// // format: d i, p x u, s // prec: z l ll L // width: 10 @@ -130,7 +130,7 @@ static void mi_out_alignright(char fill, char* start, size_t len, size_t extra, } -static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) +static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) { if (x == 0 || base == 0 || base > 16) { if (prefix != 0) { mi_outc(prefix, out, end); } @@ -144,8 +144,8 @@ static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* mi_outc((digit <= 9 ? '0' + digit : 'A' + digit - 10),out,end); x = x / base; } - if (prefix != 0) { - mi_outc(prefix, out, end); + if (prefix != 0) { + mi_outc(prefix, out, end); } size_t len = *out - start; // and reverse in-place @@ -181,7 +181,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { size_t width = 0; char numtype = 'd'; char numplus = 0; - bool alignright = true; + bool alignright = true; if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); } if (c == '-') { alignright = false; MI_NEXTC(); } if (c == '0') { fill = '0'; MI_NEXTC(); } @@ -191,7 +191,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { width = (10 * width) + (c - '0'); MI_NEXTC(); } if (c == 0) break; // extra check due to while - } + } if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); } else if (c == 'l') { numtype = c; MI_NEXTC(); @@ -273,3 +273,56 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) 
{ _mi_vsnprintf(buf, buflen, fmt, args); va_end(args); } + + + +// -------------------------------------------------------- +// generic trailing and leading zero count +// -------------------------------------------------------- + +static inline size_t mi_ctz_generic32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; + if (x==0) return 32; + return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; +} + +static inline size_t mi_clz_generic32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, + 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 + }; + if (x==0) return 32; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; +} + +size_t _mi_clz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_clz_generic32((uint32_t)x); + #else + const size_t count = mi_clz_generic32((uint32_t)(x >> 32)); + if (count < 32) return count; + return (32 + mi_clz_generic32((uint32_t)x)); + #endif +} + +size_t _mi_ctz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_ctz_generic32((uint32_t)x); + #else + const size_t count = mi_ctz_generic32((uint32_t)x); + if (count < 32) return count; + return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + #endif +} diff --git a/src/os.c b/src/os.c index a7130b90..36b167cb 100644 --- a/src/os.c +++ b/src/os.c @@ -359,6 +359,18 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo return p; } +void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { + void* p = _mi_os_alloc(size, memid, &_mi_stats_main); + if (p == NULL) return NULL; + + // zero the OS memory if needed + if (!memid->initially_zero) { + _mi_memzero_aligned(p, size); + memid->initially_zero = true; + } + return p; +} + /* ----------------------------------------------------------- OS aligned allocation with an offset. This is used for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc diff --git a/src/page-queue.c b/src/page-queue.c index 9796f3dc..0a791adb 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -83,9 +83,10 @@ static inline uint8_t mi_bin(size_t size) { #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif - wsize--; - // find the highest bit - uint8_t b = (uint8_t)mi_bsr(wsize); // note: wsize != 0 + wsize--; + mi_assert_internal(wsize!=0); + // find the highest bit position + uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin diff --git a/src/xarena.c b/src/xarena.c new file mode 100644 index 00000000..42943f84 --- /dev/null +++ b/src/xarena.c @@ -0,0 +1,1777 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +"Arenas" are fixed area's of OS memory from which we can allocate +large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB). +In contrast to the rest of mimalloc, the arenas are shared between +threads and need to be accessed using atomic operations. + +Arenas are also used to for huge OS page (1GiB) reservations or for reserving +OS memory upfront which can be improve performance or is sometimes needed +on embedded devices. We can also employ this with WASI or `sbrk` systems +to reserve large arenas upfront and be able to reuse the memory more effectively. + +The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "xbitmap.h" + + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + +#define MI_ARENA_BLOCK_SIZE (MI_SMALL_PAGE_SIZE) // 64KiB +#define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) // 64KiB +#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) + +#define MI_ARENA_MIN_OBJ_SIZE MI_ARENA_BLOCK_SIZE +#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_CHUNK_BITS * MI_ARENA_BLOCK_SIZE) // for now, cannot cross chunk boundaries + +// A memory arena descriptor +typedef struct mi_arena_s { + mi_arena_id_t id; // arena id; 0 for non-specific + mi_memid_t memid; // memid of the memory area + // _Atomic(uint8_t*) start; // the start of the memory area + // size_t meta_size; // size of the arena structure itself (including its bitmaps) + // mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + int numa_node; // associated NUMA node + bool exclusive; // only allow allocations if specifically for this arena + bool is_large; // memory area consists of large- or huge OS pages (always committed) + mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + + mi_bitmap_t blocks_free; // is the block free? + mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) + mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free) + mi_bitmap_t blocks_dirty; // is the block potentially non-zero? + mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + // the full queue contains abandoned full pages +} mi_arena_t; + +#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) + +// The available arenas +static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; +static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 + + +/* ----------------------------------------------------------- + Arena id's + id = arena_index + 1 +----------------------------------------------------------- */ + +size_t mi_arena_id_index(mi_arena_id_t id) { + return (size_t)(id <= 0 ? 
MI_MAX_ARENAS : id - 1); +} + +static mi_arena_id_t mi_arena_id_create(size_t arena_index) { + mi_assert_internal(arena_index < MI_MAX_ARENAS); + return (int)arena_index + 1; +} + +mi_arena_id_t _mi_arena_id_none(void) { + return 0; +} + +static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { + return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || + (arena_id == req_arena_id)); +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + if (memid.memkind == MI_MEM_ARENA) { + return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + } + else { + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + } +} + +size_t mi_arena_get_count(void) { + return mi_atomic_load_relaxed(&mi_arena_count); +} + +mi_arena_t* mi_arena_from_index(size_t idx) { + mi_assert_internal(idx < mi_arena_get_count()); + return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); +} + + + +/* ----------------------------------------------------------- + Util +----------------------------------------------------------- */ + +// Blocks needed for a given byte size +static size_t mi_block_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); +} + +// Byte size of a number of blocks +static size_t mi_size_of_blocks(size_t bcount) { + return (bcount * MI_ARENA_BLOCK_SIZE); +} + +// Size of an arena +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_size_of_blocks(arena->block_count); +} + +static size_t mi_arena_info_blocks(void) { + const size_t os_page_size = _mi_os_page_size(); + const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page + const size_t info_blocks = mi_block_count_of_size(info_size); + return info_blocks; +} + + +// Start of the arena memory area +static uint8_t* mi_arena_start(mi_arena_t* arena) { + return ((uint8_t*)arena); +} + +// Start of a block +void* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { + return (mi_arena_start(arena) + mi_size_of_blocks(block_index)); +} + +// Arena area +void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { + if (size != NULL) *size = 0; + const size_t arena_index = mi_arena_id_index(arena_id); + if (arena_index >= MI_MAX_ARENAS) return NULL; + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + if (arena == NULL) return NULL; + if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + return mi_arena_start(arena); +} + + +// Create an arena memid +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) { + mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); + memid.mem.arena.id = id; + memid.mem.arena.block_index = block_index; + memid.mem.arena.is_exclusive = is_exclusive; + return memid; +} + +// returns if the arena is exclusive +bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { + mi_assert_internal(memid.memkind == MI_MEM_ARENA); + *arena_index = mi_arena_id_index(memid.mem.arena.id); + *block_index = memid.mem.arena.block_index; + return memid.mem.arena.is_exclusive; +} + + + +/* ----------------------------------------------------------- + Arena Allocation +----------------------------------------------------------- */ + +static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, + bool commit, size_t 
tseq, mi_memid_t* memid, mi_os_tld_t* tld) +{ + MI_UNUSED(arena_index); + mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); + + size_t block_index; + if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL; + + // claimed it! + void* p = mi_arena_block_start(arena, block_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index); + memid->is_pinned = arena->memid.is_pinned; + + // set the dirty bits + if (arena->memid.initially_zero) { + memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL); + } + + // set commit state + if (commit) { + // commit requested, but the range may not be committed as a whole: ensure it is committed now + memid->initially_committed = true; + + bool all_already_committed; + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); + if (!all_already_committed) { + bool commit_zero = false; + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + memid->initially_committed = false; + } + else { + if (commit_zero) { memid->initially_zero = true; } + } + } + } + else { + // no need to commit, but check if already fully committed + memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); + } + + return p; +} + +// allocate in a speficic arena +static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, + size_t size, size_t alignment, + bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; + + const size_t bcount = mi_block_count_of_size(size); + const size_t arena_index = mi_arena_id_index(arena_id); + mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); + mi_assert_internal(size <= mi_size_of_blocks(bcount)); + + // Check arena suitability + mi_arena_t* arena = mi_arena_from_index(arena_index); + if (arena == NULL) return NULL; + if (!allow_large && arena->is_large) return NULL; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; + if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (match_numa_node) { if (!numa_suitable) return NULL; } + else { if (numa_suitable) return NULL; } + } + + // try to allocate + void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld); + mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); + return p; +} + + +// allocate from an arena with fallback to the OS +static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; + + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + if mi_likely(max_arena == 0) return NULL; + + if (req_arena_id != _mi_arena_id_none()) { + // try a specific arena if requested + if (mi_arena_id_index(req_arena_id) < max_arena) { + void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, 
memid, tld); + if (p != NULL) return p; + } + } + else { + // try numa affine allocation + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + + // try from another numa node instead.. + if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + } + } + return NULL; +} + +// try to reserve a fresh arena space +static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) +{ + if (_mi_preloading()) return false; // use OS only while pre loading + if (req_arena_id != _mi_arena_id_none()) return false; + + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + if (arena_count > (MI_MAX_ARENAS - 4)) return false; + + // calc reserve + size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); + if (arena_reserve == 0) return false; + + if (!_mi_os_has_virtual_reserve()) { + arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) + } + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + + if (arena_count >= 8 && arena_count <= 128) { + // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } + } + + // check arena bounds + const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1); + const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE; + if (arena_reserve < min_reserve) { + arena_reserve = min_reserve; + } + else if (arena_reserve > max_reserve) { + arena_reserve = max_reserve; + } + + if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size + + // commit eagerly? + bool arena_commit = false; + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } + + return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); +} + + +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + size_t tseq = _mi_thread_seq_id(); + *memid = _mi_memid_none(); + + const int numa_node = _mi_os_numa_node(tld); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + } + } + } + + // if we cannot use OS allocation, return NULL + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { + errno = ENOMEM; + return NULL; + } + + // finally, fall back to the OS + if (align_offset > 0) { + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + } + else { + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + } +} + +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); +} + + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats); +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); + +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(committed_size <= size); + if (p==NULL) return; + if (size==0) return; + const bool all_committed = (committed_size == size); + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
+ mi_track_mem_undefined(p, size); + + if (mi_memkind_is_os(memid.memkind)) { + // was a direct OS allocation, pass through + if (!all_committed && committed_size > 0) { + // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + _mi_os_free(p, size, memid, stats); + } + else if (memid.memkind == MI_MEM_ARENA) { + // allocated in an arena + size_t arena_idx; + size_t block_idx; + mi_arena_memid_indices(memid, &arena_idx, &block_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + const size_t blocks = mi_block_count_of_size(size); + + // checks + if (arena == NULL) { + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + mi_assert_internal(block_idx < arena->block_count); + mi_assert_internal(block_idx > mi_arena_info_blocks()); + if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) { + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + + // potentially decommit + if (arena->memid.is_pinned || arena->memid.initially_committed) { + mi_assert_internal(all_committed); + } + else { + if (!all_committed) { + // mark the entire range as no longer committed (so we recommit the full range when re-using) + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + mi_track_mem_noaccess(p, size); + if (committed_size > 0) { + // if partially committed, adjust the committed stats (is it will be recommitted when re-using) + // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + // note: if not all committed, it may be that the purge will reset/decommit the entire range + // that contains already decommitted parts. Since purge consistently uses reset or decommit that + // works (as we should never reset decommitted parts). + } + // (delay) purge the entire range + mi_arena_schedule_purge(arena, block_idx, blocks, stats); + } + + // and make it available to others again + bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL); + if (!all_inuse) { + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); + return; + }; + } + else { + // arena was none, external, or static; nothing to do + mi_assert_internal(memid.memkind < MI_MEM_OS); + } + + // purge expired decommits + mi_arenas_try_purge(false, false, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. 
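
The free path above only schedules purging (`mi_arena_schedule_purge`) and later collects expired ranges (`mi_arenas_try_purge`); their definitions fall outside this excerpt. Purely to illustrate the delayed-purge idea behind the `purge_expire` and `blocks_purge` fields, here is a simplified, self-contained sketch with hypothetical names (not this patch's implementation); the unsafe-destroy helper documented above follows right after it:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    // Hypothetical, simplified per-arena purge bookkeeping (illustration only).
    typedef struct purge_info_s {
      _Atomic(int64_t) purge_expire;  // 0 = no purge pending, otherwise expiration time in msecs
      // ...the real arena additionally tracks *which* blocks are purgeable in a bitmap
    } purge_info_t;

    // On free: arm a single expiration time if none is pending yet
    // (and mark the freed range as purgeable in the bitmap).
    static void schedule_purge(purge_info_t* info, int64_t now_msecs, int64_t delay_msecs) {
      int64_t expected = 0;
      atomic_compare_exchange_strong(&info->purge_expire, &expected, now_msecs + delay_msecs);
    }

    // Later (or when forced): decommit the marked blocks once the expiration has passed.
    static bool purge_is_due(purge_info_t* info, int64_t now_msecs, bool force) {
      const int64_t expire = atomic_load(&info->purge_expire);
      return (expire != 0 && (force || now_msecs >= expire));
    }
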
+static void mi_arenas_unsafe_destroy(void) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + size_t new_max_arena = 0; + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); + if (mi_memkind_is_os(arena->memid.memkind)) { + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main); + } + } + } + + // try to lower the max arena. + size_t expected = max_arena; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); +} + +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { + mi_arenas_unsafe_destroy(); + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + return true; + } + } + return false; +} + + +/* ----------------------------------------------------------- + Add an arena. +----------------------------------------------------------- */ + +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { + mi_assert_internal(arena != NULL); + mi_assert_internal(arena->block_count > 0); + if (arena_id != NULL) { *arena_id = -1; } + + size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + if (i >= MI_MAX_ARENAS) { + mi_atomic_decrement_acq_rel(&mi_arena_count); + return false; + } + _mi_stat_counter_increase(&stats->arena_count,1); + arena->id = mi_arena_id_create(i); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena->id; } + return true; +} + +static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +{ + mi_assert(!is_large || memid.initially_committed && memid.is_pinned); + mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)); + mi_assert(start!=NULL); + if (start==NULL) return false; + if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) { + // todo: use alignment in memid to align to blocksize first? 
+ _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start); + return false; + } + + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + + const size_t info_blocks = mi_arena_info_blocks(); + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down + if (bcount < info_blocks+1) { + _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB); + return false; + } + if (bcount > MI_BITMAP_MAX_BITS) { + // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) + _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB); + return false; + } + mi_arena_t* arena = (mi_arena_t*)start; + + // commit & zero if needed + bool is_zero = memid.initially_zero; + if (!memid.initially_committed) { + _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main); + } + if (!is_zero) { + _mi_memzero(arena, mi_size_of_blocks(info_blocks)); + } + + // init + arena->id = _mi_arena_id_none(); + arena->memid = memid; + arena->exclusive = exclusive; + arena->block_count = bcount; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->is_large = is_large; + arena->purge_expire = 0; + mi_lock_init(&arena->abandoned_visit_lock); + + // init bitmaps + mi_bitmap_init(&arena->blocks_free,true); + mi_bitmap_init(&arena->blocks_committed,true); + mi_bitmap_init(&arena->blocks_dirty,true); + mi_bitmap_init(&arena->blocks_purge,true); + for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { + mi_bitmap_init(&arena->blocks_abandoned[i],true); + } + + // reserve our meta info (and reserve blocks outside the memory area) + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks); + if (memid.initially_committed) { + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count); + } + else { + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL); + } + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL); + + return mi_arena_add(arena, arena_id, &_mi_stats_main); +} + + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + mi_memid_t memid; + void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + if (start == NULL) return ENOMEM; + const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
+ if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); + return ENOMEM; + } + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + return 0; +} + + +// Manage a range of regular OS memory +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { + return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { + return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); +} + + +/* ----------------------------------------------------------- + Debugging +----------------------------------------------------------- */ +static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { + size_t bit_set_count = 0; + for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { + bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); + if (is_set) bit_set_count++; + buf[bit] = (is_set ? 'x' : '.'); + } + return bit_set_count; +} + +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bit_count = 0; + size_t bit_set_count = 0; + for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { + char buf[MI_BITMAP_CHUNK_BITS + 1]; + mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; + for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + if (bit_count < block_count) { + bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS); + } + else { + _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS); + } + bit_count += MI_BFIELD_BITS; + } + buf[MI_BITMAP_CHUNK_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); + } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); + return bit_set_count; +} + +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { + MI_UNUSED(show_abandoned); + size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t free_total = 0; + size_t block_total = 0; + //size_t abandoned_total = 0; + size_t purge_total = 0; + for (size_t i = 0; i < max_arenas; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; + block_total += arena->block_count; + _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? 
", pinned" : "")); + if (show_inuse) { + free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); + } + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed); + // todo: abandoned blocks + if (show_purge) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge); + } + } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total); + // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); +} + + +/* ----------------------------------------------------------- + Reserve a huge page arena. +----------------------------------------------------------- */ +// reserve at a specific numa node +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = -1; + if (pages==0) return 0; + if (numa_node < -1) numa_node = -1; + if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); + size_t hsize = 0; + size_t pages_reserved = 0; + mi_memid_t memid; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); + if (p==NULL || pages_reserved==0) { + _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); + return ENOMEM; + } + _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); + + if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + _mi_os_free(p, hsize, memid, &_mi_stats_main); + return ENOMEM; + } + return 0; +} + +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { + return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); +} + +// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { + if (pages == 0) return 0; + + // pages per numa node + size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) numa_count = 1; + const size_t pages_per = pages / numa_count; + const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs==0 ? 
0 : (timeout_msecs / numa_count) + 50); + + // reserve evenly among numa nodes + for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + size_t node_pages = pages_per; // can be 0 + if (numa_node < pages_mod) node_pages++; + int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); + if (err) return err; + if (pages < node_pages) { + pages = 0; + } + else { + pages -= node_pages; + } + } + + return 0; +} + +int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + MI_UNUSED(max_secs); + _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); + if (pages_reserved != NULL) *pages_reserved = 0; + int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); + if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; + return err; +} + + + +/* ----------------------------------------------------------- + Arena purge +----------------------------------------------------------- */ + +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} + +// reset or decommit in an arena and update the committed/decommit bitmaps +// assumes we own the area (i.e. blocks_free is claimed by us) +static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(!arena->memid.is_pinned); + const size_t size = mi_size_of_blocks(blocks); + void* const p = mi_arena_block_start(arena, block_idx); + bool needs_recommit; + if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) { + // all blocks are committed, we can purge freely + needs_recommit = _mi_os_purge(p, size, stats); + } + else { + // some blocks are not committed -- this can happen when a partially committed block is freed + // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // and also undo the decommit stats (as it was already adjusted) + mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); + if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + } + + // clear the purged blocks + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, blocks, block_idx, NULL); + + // update committed bitmap + if (needs_recommit) { + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + } +} + + +// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. +// Note: assumes we (still) own the area as we may purge immediately +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { + const long delay = mi_arena_purge_delay(); + if (delay < 0) return; // is purging allowed at all? 
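+  // note: a fully delayed purge would set `arena->purge_expire` and mark the range in
+  // `arena->blocks_purge` (compare the disabled implementation further below); at the
+  // moment only the immediate path is wired up.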
+ + if (_mi_preloading() || delay == 0) { + // decommit directly + mi_arena_purge(arena, block_idx, blocks, stats); + } + else { + // schedule decommit + _mi_error_message(EFAULT, "purging not yet implemented\n"); + } +} + + +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { + if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + if (max_arena == 0) return; + + _mi_error_message(EFAULT, "purging not yet implemented\n"); + MI_UNUSED(stats); + MI_UNUSED(visit_all); + MI_UNUSED(force); +} + + +#if 0 + +#define MI_IN_ARENA_C +#include "arena-abandon.c" +#undef MI_IN_ARENA_C + +/* ----------------------------------------------------------- + Arena id's + id = arena_index + 1 +----------------------------------------------------------- */ + +size_t mi_arena_id_index(mi_arena_id_t id) { + return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); +} + +static mi_arena_id_t mi_arena_id_create(size_t arena_index) { + mi_assert_internal(arena_index < MI_MAX_ARENAS); + return (int)arena_index + 1; +} + +mi_arena_id_t _mi_arena_id_none(void) { + return 0; +} + +static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { + return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || + (arena_id == req_arena_id)); +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + if (memid.memkind == MI_MEM_ARENA) { + return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + } + else { + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + } +} + +size_t mi_arena_get_count(void) { + return mi_atomic_load_relaxed(&mi_arena_count); +} + +mi_arena_t* mi_arena_from_index(size_t idx) { + mi_assert_internal(idx < mi_arena_get_count()); + return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); +} + + +/* ----------------------------------------------------------- + Arena allocations get a (currently) 16-bit memory id where the + lower 8 bits are the arena id, and the upper bits the block index. 
+----------------------------------------------------------- */ + +static size_t mi_block_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); +} + +static size_t mi_size_of_blocks(size_t bcount) { + return (bcount * MI_ARENA_BLOCK_SIZE); +} + +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_size_of_blocks(arena->block_count); +} + +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { + mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); + memid.mem.arena.id = id; + memid.mem.arena.block_index = bitmap_index; + memid.mem.arena.is_exclusive = is_exclusive; + return memid; +} + +bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { + mi_assert_internal(memid.memkind == MI_MEM_ARENA); + *arena_index = mi_arena_id_index(memid.mem.arena.id); + *bitmap_index = memid.mem.arena.block_index; + return memid.mem.arena.is_exclusive; +} + + + +/* ----------------------------------------------------------- + Special static area for mimalloc internal structures + to avoid OS calls (for example, for the arena metadata (~= 256b)) +----------------------------------------------------------- */ + +#define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit + +static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 +static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; + +static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { + *memid = _mi_memid_none(); + if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; + const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); + if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; + + // try to claim space + if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } + const size_t oversize = size + alignment - 1; + if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; + const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); + size_t top = oldtop + oversize; + if (top > MI_ARENA_STATIC_MAX) { + // try to roll back, ok if this fails + mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); + return NULL; + } + + // success + *memid = _mi_memid_create(MI_MEM_STATIC); + memid->initially_zero = true; + const size_t start = _mi_align_up(oldtop, alignment); + uint8_t* const p = &mi_arena_static[start]; + _mi_memzero_aligned(p, size); + return p; +} + +void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { + *memid = _mi_memid_none(); + + // try static + void* p = mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); + if (p != NULL) return p; + + // or fall back to the OS + p = _mi_os_alloc(size, memid, &_mi_stats_main); + if (p == NULL) return NULL; + + // zero the OS memory if needed + if (!memid->initially_zero) { + _mi_memzero_aligned(p, size); + memid->initially_zero = true; + } + return p; +} + +void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { + if (mi_memkind_is_os(memid.memkind)) { + _mi_os_free(p, size, memid, &_mi_stats_main); + } + else { + mi_assert(memid.memkind == MI_MEM_STATIC); + } +} + +void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { + return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex))); +} + + +/* ----------------------------------------------------------- + Thread safe allocation in an arena +----------------------------------------------------------- */ + +// claim 
the `blocks_inuse` bits +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats) +{ + size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter + if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { + mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around + return true; + }; + return false; +} + + +/* ----------------------------------------------------------- + Arena Allocation +----------------------------------------------------------- */ + +static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, + bool commit, mi_memid_t* memid, mi_os_tld_t* tld) +{ + MI_UNUSED(arena_index); + mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); + + mi_bitmap_index_t bitmap_index; + if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return NULL; + + // claimed it! + void* p = mi_arena_block_start(arena, bitmap_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); + memid->is_pinned = arena->memid.is_pinned; + + // none of the claimed blocks should be scheduled for a decommit + if (arena->blocks_purge != NULL) { + // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`). + _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index); + } + + // set the dirty bits (todo: no need for an atomic op here?) + if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { + memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); + } + + // set commit state + if (arena->blocks_committed == NULL) { + // always committed + memid->initially_committed = true; + } + else if (commit) { + // commit requested, but the range may not be committed as a whole: ensure it is committed now + memid->initially_committed = true; + bool any_uncommitted; + _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); + if (any_uncommitted) { + bool commit_zero = false; + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + memid->initially_committed = false; + } + else { + if (commit_zero) { memid->initially_zero = true; } + } + } + } + else { + // no need to commit, but check if already fully committed + memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); + } + + return p; +} + +// allocate in a speficic arena +static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) +{ + MI_UNUSED_RELEASE(alignment); + mi_assert(alignment <= MI_SEGMENT_ALIGN); + const size_t bcount = mi_block_count_of_size(size); + const size_t arena_index = mi_arena_id_index(arena_id); + mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); + mi_assert_internal(size <= mi_size_of_blocks(bcount)); + + // Check arena suitability + mi_arena_t* arena = mi_arena_from_index(arena_index); + if (arena == NULL) return NULL; + if (!allow_large && 
arena->is_large) return NULL; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; + if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (match_numa_node) { if (!numa_suitable) return NULL; } + else { if (numa_suitable) return NULL; } + } + + // try to allocate + void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld); + mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); + return p; +} + + +// allocate from an arena with fallback to the OS +static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) +{ + MI_UNUSED(alignment); + mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + if mi_likely(max_arena == 0) return NULL; + + if (req_arena_id != _mi_arena_id_none()) { + // try a specific arena if requested + if (mi_arena_id_index(req_arena_id) < max_arena) { + void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + else { + // try numa affine allocation + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + + // try from another numa node instead.. + if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + } + return NULL; +} + +// try to reserve a fresh arena space +static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) +{ + if (_mi_preloading()) return false; // use OS only while pre loading + if (req_arena_id != _mi_arena_id_none()) return false; + + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + if (arena_count > (MI_MAX_ARENAS - 4)) return false; + + size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); + if (arena_reserve == 0) return false; + + if (!_mi_os_has_virtual_reserve()) { + arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) + } + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); + if (arena_count >= 8 && arena_count <= 128) { + // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } + } + if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size + + // commit eagerly? 
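+  // (option `arena_eager_commit`: 2 = commit eagerly only if the OS overcommits,
+  //  1 = always commit eagerly, and otherwise commit on demand)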
+ bool arena_commit = false; + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } + + return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); +} + + +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + *memid = _mi_memid_none(); + + const int numa_node = _mi_os_numa_node(tld); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? + if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + } + } + + // if we cannot use OS allocation, return NULL + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { + errno = ENOMEM; + return NULL; + } + + // finally, fall back to the OS + if (align_offset > 0) { + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + } + else { + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + } +} + +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); +} + + +void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { + if (size != NULL) *size = 0; + size_t arena_index = mi_arena_id_index(arena_id); + if (arena_index >= MI_MAX_ARENAS) return NULL; + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + if (arena == NULL) return NULL; + if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + return arena->start; +} + + +/* ----------------------------------------------------------- + Arena purge +----------------------------------------------------------- */ + +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} + +// reset or decommit in an arena and update the committed/decommit bitmaps +// assumes we own the area (i.e. 
blocks_in_use is claimed by us) +static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(arena->blocks_committed != NULL); + mi_assert_internal(arena->blocks_purge != NULL); + mi_assert_internal(!arena->memid.is_pinned); + const size_t size = mi_size_of_blocks(blocks); + void* const p = mi_arena_block_start(arena, bitmap_idx); + bool needs_recommit; + if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { + // all blocks are committed, we can purge freely + needs_recommit = _mi_os_purge(p, size, stats); + } + else { + // some blocks are not committed -- this can happen when a partially committed block is freed + // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // and also undo the decommit stats (as it was already adjusted) + mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); + if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + } + + // clear the purged blocks + _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); + // update committed bitmap + if (needs_recommit) { + _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + } +} + +// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. +// Note: assumes we (still) own the area as we may purge immediately +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(arena->blocks_purge != NULL); + const long delay = mi_arena_purge_delay(); + if (delay < 0) return; // is purging allowed at all? + + if (_mi_preloading() || delay == 0) { + // decommit directly + mi_arena_purge(arena, bitmap_idx, blocks, stats); + } + else { + // schedule decommit + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire != 0) { + mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay + } + else { + mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + } + _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); + } +} + +// purge a range of blocks +// return true if the full range was purged. +// assumes we own the area (i.e. 
blocks_in_use is claimed by us) +static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) { + const size_t endidx = startseqx + bitlen; + size_t bitseqx = startseqx; + bool all_purged = false; + while (bitseqx < endidx) { + // count consecutive ones in the purge mask + size_t count = 0; + while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) { + count++; + } + if (count > 0) { + // found range to be purged + const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx); + mi_arena_purge(arena, range_idx, count, stats); + if (count == bitlen) { + all_purged = true; + } + } + bitseqx += (count+1); // +1 to skip the zero bit (or end) + } + return all_purged; +} + +// returns true if anything was purged +static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) +{ + if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire == 0) return false; + if (!force && expire > now) return false; + + // reset expire (if not already set concurrently) + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + + // potential purges scheduled, walk through the bitmap + bool any_purged = false; + bool full_purge = true; + for (size_t i = 0; i < arena->field_count; i++) { + size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); + if (purge != 0) { + size_t bitseqx = 0; + while (bitseqx < MI_BITMAP_FIELD_BITS) { + // find consecutive range of ones in the purge mask + size_t bitlen = 0; + while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) { + bitlen++; + } + // temporarily claim the purge range as "in-use" to be thread-safe with allocation + // try to claim the longest range of corresponding in_use bits + const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx); + while( bitlen > 0 ) { + if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { + break; + } + bitlen--; + } + // actual claimed bits at `in_use` + if (bitlen > 0) { + // read purge again now that we have the in_use bits + purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); + if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) { + full_purge = false; + } + any_purged = true; + // release the claimed `in_use` bits again + _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); + } + bitseqx += (bitlen+1); // +1 to skip the zero (or end) + } // while bitseqx + } // purge != 0 + } + // if not fully purged, make sure to purge again in the future + if (!full_purge) { + const long delay = mi_arena_purge_delay(); + mi_msecs_t expected = 0; + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); + } + return any_purged; +} + +static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { + if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + if (max_arena == 0) return; + + // allow only one thread to purge at a time + static mi_atomic_guard_t purge_guard; + mi_atomic_guard(&purge_guard) + { + mi_msecs_t now = _mi_clock_now(); + size_t max_purge_count = (visit_all ? 
max_arena : 1); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + if (mi_arena_try_purge(arena, now, force, stats)) { + if (max_purge_count <= 1) break; + max_purge_count--; + } + } + } + } +} + + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ + +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(committed_size <= size); + if (p==NULL) return; + if (size==0) return; + const bool all_committed = (committed_size == size); + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) + mi_track_mem_undefined(p,size); + + if (mi_memkind_is_os(memid.memkind)) { + // was a direct OS allocation, pass through + if (!all_committed && committed_size > 0) { + // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + _mi_os_free(p, size, memid, stats); + } + else if (memid.memkind == MI_MEM_ARENA) { + // allocated in an arena + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + const size_t blocks = mi_block_count_of_size(size); + + // checks + if (arena == NULL) { + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); + if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + + // potentially decommit + if (arena->memid.is_pinned || arena->blocks_committed == NULL) { + mi_assert_internal(all_committed); + } + else { + mi_assert_internal(arena->blocks_committed != NULL); + mi_assert_internal(arena->blocks_purge != NULL); + + if (!all_committed) { + // mark the entire range as no longer committed (so we recommit the full range when re-using) + _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + mi_track_mem_noaccess(p,size); + if (committed_size > 0) { + // if partially committed, adjust the committed stats (is it will be recommitted when re-using) + // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + // note: if not all committed, it may be that the purge will reset/decommit the entire range + // that contains already decommitted parts. Since purge consistently uses reset or decommit that + // works (as we should never reset decommitted parts). 
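+        // (the later purge compensates for this in `mi_arena_purge`: when it decommits a range
+        //  that is not fully marked as committed, it re-increases the committed statistic.)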
+ } + // (delay) purge the entire range + mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); + } + + // and make it available to others again + bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); + if (!all_inuse) { + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); + return; + }; + } + else { + // arena was none, external, or static; nothing to do + mi_assert_internal(memid.memkind < MI_MEM_OS); + } + + // purge expired decommits + mi_arenas_try_purge(false, false, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +static void mi_arenas_unsafe_destroy(void) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + size_t new_max_arena = 0; + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); + if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); + } + else { + new_max_arena = i; + } + _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); + } + } + + // try to lower the max arena. + size_t expected = max_arena; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); +} + +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { + mi_arenas_unsafe_destroy(); + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + return true; + } + } + return false; +} + +/* ----------------------------------------------------------- + Add an arena. 
+----------------------------------------------------------- */ + +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { + mi_assert_internal(arena != NULL); + mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); + mi_assert_internal(arena->block_count > 0); + if (arena_id != NULL) { *arena_id = -1; } + + size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + if (i >= MI_MAX_ARENAS) { + mi_atomic_decrement_acq_rel(&mi_arena_count); + return false; + } + _mi_stat_counter_increase(&stats->arena_count,1); + arena->id = mi_arena_id_create(i); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena->id; } + return true; +} + +static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +{ + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + if (size < MI_ARENA_BLOCK_SIZE) return false; + + if (is_large) { + mi_assert_internal(memid.initially_committed && memid.is_pinned); + } + + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; + const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); + const size_t bitmaps = (memid.is_pinned ? 3 : 5); + const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); + mi_memid_t meta_memid; + mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); + if (arena == NULL) return false; + + // already zero'd due to zalloc + // _mi_memzero(arena, asize); + arena->id = _mi_arena_id_none(); + arena->memid = memid; + arena->exclusive = exclusive; + arena->meta_size = asize; + arena->meta_memid = meta_memid; + arena->block_count = bcount; + arena->field_count = fields; + arena->start = (uint8_t*)start; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->is_large = is_large; + arena->purge_expire = 0; + arena->search_idx = 0; + mi_lock_init(&arena->abandoned_visit_lock); + // consecutive bitmaps + arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap + arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap + arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap + arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap + // initialize committed bitmap? 
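+  // (when the memory is already committed, mark all blocks committed up-front so that
+  //  `mi_arena_try_alloc_at` can skip commit-on-demand; otherwise the bitmap stays zero
+  //  and blocks are committed lazily on first use)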
+ if (arena->blocks_committed != NULL && arena->memid.initially_committed) { + memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning + } + + // and claim leftover blocks if needed (so we never allocate there) + ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; + mi_assert_internal(post >= 0); + if (post > 0) { + // don't use leftover bits at the end + mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); + _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL); + } + return mi_arena_add(arena, arena_id, &_mi_stats_main); + +} + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + mi_memid_t memid; + void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + if (start == NULL) return ENOMEM; + const bool is_large = memid.is_pinned; // todo: use separate is_large field? + if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); + return ENOMEM; + } + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + return 0; +} + + +// Manage a range of regular OS memory +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { + return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { + return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); +} + + +/* ----------------------------------------------------------- + Debugging +----------------------------------------------------------- */ + +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bcount = 0; + size_t inuse_count = 0; + for (size_t i = 0; i < field_count; i++) { + char buf[MI_BITMAP_FIELD_BITS + 1]; + uintptr_t field = mi_atomic_load_relaxed(&fields[i]); + for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { + if (bcount < block_count) { + bool inuse = ((((uintptr_t)1 << bit) & field) != 0); + if (inuse) inuse_count++; + buf[bit] = (inuse ? 
'x' : '.'); + } + else { + buf[bit] = ' '; + } + } + buf[MI_BITMAP_FIELD_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); + } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); + return inuse_count; +} + +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { + size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t inuse_total = 0; + size_t abandoned_total = 0; + size_t purge_total = 0; + for (size_t i = 0; i < max_arenas; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; + _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); + if (show_inuse) { + inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); + } + if (arena->blocks_committed != NULL) { + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); + } + if (show_abandoned) { + abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); + } + if (show_purge && arena->blocks_purge != NULL) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); + } + } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); + if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); +} + + +/* ----------------------------------------------------------- + Reserve a huge page arena. +----------------------------------------------------------- */ +// reserve at a specific numa node +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = -1; + if (pages==0) return 0; + if (numa_node < -1) numa_node = -1; + if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); + size_t hsize = 0; + size_t pages_reserved = 0; + mi_memid_t memid; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); + if (p==NULL || pages_reserved==0) { + _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); + return ENOMEM; + } + _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); + + if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + _mi_os_free(p, hsize, memid, &_mi_stats_main); + return ENOMEM; + } + return 0; +} + +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { + return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); +} + +// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { + if (pages == 0) return 0; + + // pages per numa node + size_t numa_count = (numa_nodes > 0 ? 
numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) numa_count = 1; + const size_t pages_per = pages / numa_count; + const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); + + // reserve evenly among numa nodes + for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + size_t node_pages = pages_per; // can be 0 + if (numa_node < pages_mod) node_pages++; + int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); + if (err) return err; + if (pages < node_pages) { + pages = 0; + } + else { + pages -= node_pages; + } + } + + return 0; +} + +int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + MI_UNUSED(max_secs); + _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); + if (pages_reserved != NULL) *pages_reserved = 0; + int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); + if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; + return err; +} + + +#endif \ No newline at end of file diff --git a/src/xbitmap.c b/src/xbitmap.c new file mode 100644 index 00000000..68525c84 --- /dev/null +++ b/src/xbitmap.c @@ -0,0 +1,599 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Concurrent bitmap that can set/reset sequences of bits atomically +---------------------------------------------------------------------------- */ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/bits.h" +#include "xbitmap.h" + +/* -------------------------------------------------------------------------------- + bfields +-------------------------------------------------------------------------------- */ + +static inline size_t mi_bfield_ctz(mi_bfield_t x) { + return mi_ctz(x); +} + +static inline size_t mi_bfield_clz(mi_bfield_t x) { + return mi_clz(x); +} + +// find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { + return mi_bsf(x,idx); +} + +static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { + return mi_rotr(x,r); +} + +// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). +static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[i], idx); +} + +static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { + mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); +} + +// Set/clear a sequence of `n` bits within a chunk. 
Returns true if all bits transitioned from 0 to 1 (or 1 to 0) +static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_transition = true; + bool all_already_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); + all_already_xset = all_already_xset && already_xset; + // next field + field++; + idx = 0; + n -= m; + } + *palready_xset = all_already_xset; + return all_transition; +} + +// Check if a sequence of `n` bits within a chunk are all set/cleared. +static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask); + // next field + field++; + idx = 0; + n -= m; + } + return all_xset; +} + +// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving all bit fields as is. +static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + if (n==0) return true; + size_t start_idx = cidx % MI_BFIELD_BITS; + size_t start_field = cidx / MI_BFIELD_BITS; + size_t end_field = MI_BITMAP_CHUNK_FIELDS; + size_t mask_mid = 0; + size_t mask_end = 0; + + // first field + size_t field = start_field; + size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); + mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask_start)) return false; + + // done? + n -= m; + if (n==0) return true; + + // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields + + // mid fields + while (n >= MI_BFIELD_BITS) { + field++; + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mask_mid = ~MI_ZU(0); + if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore; + n -= MI_BFIELD_BITS; + } + + // last field + if (n > 0) { + mi_assert_internal(n < MI_BFIELD_BITS); + field++; + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + end_field = field; + mask_end = (MI_ZU(1)<bfields[field], mask_end)) goto restore; + } + + return true; + +restore: + // field is on the field that failed to set atomically; we need to restore all previous fields + mi_assert_internal(field > start_field); + while( field > start_field) { + field--; + const size_t mask = (field == start_field ? mask_start : (field == end_field ? 
mask_end : mask_mid)); + bool already_xset; + mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset); + } + return false; +} + + +// find least 1-bit in a chunk and try unset it atomically +// set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while(true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ? + const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? -1 : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) + mi_assert_internal(mask != 0); + const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically + *pidx = (i*MI_BFIELD_BITS + idx); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + } + return false; + #endif +} + + +// find least byte in a chunk with all bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while(true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? 
-1 : 0) + const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte + if (mask == 0) return false; + const size_t i = _tzcnt_u32(mask); + mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS); + const size_t chunk_idx = i / MI_BFIELD_SIZE; + const size_t byte_idx = i % MI_BFIELD_SIZE; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + const mi_bfield_t x = chunk->bfields[i]; + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + const size_t byte_idx = idx/8; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS); + return true; + } + // else continue + } + } + return false; + #endif +} + + +// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. +// todo: try avx2 and neon version +// todo: allow spanning across bfield boundaries? +static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger? + const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1); + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + mi_bfield_t b = chunk->bfields[i]; + size_t bshift = 0; + size_t idx; + while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + b >>= idx; + bshift += idx; + if (bshift + n >= MI_BFIELD_BITS) break; + + if ((b&mask) == mask) { // found a match + mi_assert_internal( ((mask << bshift) >> bshift) == mask ); + if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i] >> bshift); + } + } + else { + // advance + const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) + mi_assert_internal(ones>0); + bshift += ones; + b >>= ones; + } + } + } + return false; +} + + +// are all bits in a bitmap chunk set? +static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_test_all_ones(vec); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x & chunk->bfields[i]; + } + return (~x == 0); + #endif +} + +// are all bits in a bitmap chunk clear? 
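+// (this OR-reduces all fields without an early exit so the compiler can vectorize the loop,
+//  mirroring the AND-reduction used in `mi_bitmap_chunk_all_are_set` above)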
+static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_testz_si256( vec, vec ); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x | chunk->bfields[i]; + } + return (x == 0); + #endif +} + +/* -------------------------------------------------------------------------------- + bitmap +-------------------------------------------------------------------------------- */ +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { + if (!already_zero) { + _mi_memzero_aligned(bitmap, sizeof(*bitmap)); + } +} + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); + + // first chunk + size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + size_t m = MI_BITMAP_CHUNK_BITS - cidx; + if (m > n) { m = n; } + bool already_xset; + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); + + // n can be large so use memset for efficiency for all in-between chunks + chunk_idx++; + n -= m; + const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; + if (mid_chunks > 0) { + _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8); + chunk_idx += mid_chunks; + n -= mid_chunks * MI_BITMAP_CHUNK_BITS; + } + + // last chunk + if (n > 0) { + mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); + } +} + + +// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving the bitmask as is. +bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); +} + +// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise leaving the bitmask as is. +bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + mi_assert_internal(idx%8 == 0); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; + return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
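+// For illustration: a bit index is decomposed into a chunk number and an offset inside
+// that chunk, and for the atomic N-bit operations the whole range must stay inside one
+// 256-bit chunk. For example (values chosen for illustration only):
+//
+//   idx = 700, n = 40:  chunk_idx = 700/MI_BITMAP_CHUNK_BITS = 2,
+//                       cidx = 700%MI_BITMAP_CHUNK_BITS = 188, and 188+40 <= 256: ok
+//   idx = 250, n = 40:  would straddle chunk 0 and chunk 1 and is not supported here;
+//                       only the non-atomic `mi_bitmap_unsafe_xsetN` above splits such
+//                       a range over a first, middle, and last chunk.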
+bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); } + if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + bool local_already_xset; + if (already_xset==NULL) { already_xset = &local_already_xset; } + // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } + // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); +} + +// Is a sequence of n bits already all set/cleared? +bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); +} + + +#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ + { size_t _set_idx; \ + size_t _start = start % MI_BFIELD_BITS; \ + mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ + while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ + decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; + +#define mi_bitmap_forall_set_chunks_end() \ + _start += _set_idx+1; /* so chunk_idx stays valid */ \ + _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ + _any_set >>= 1; \ + } \ + } + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). 
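+// For example, a caller would typically pass a per-thread value (such as a thread
+// sequence number) as `start` so that different threads begin scanning in different
+// chunks. A hypothetical caller (`tseq` assumed to be such a per-thread counter):
+//
+//   size_t bit_idx;
+//   if (mi_bitmap_try_find_and_clear(&arena->blocks_free, &bit_idx, tseq)) {
+//     // `bit_idx` was atomically cleared and is now owned by this thread
+//   }
+//
+// Note that `any_set` is only a conservative summary: a set bit means the corresponding
+// chunk *may* contain set bits; it is cleared lazily below when a scan finds the chunk
+// to be fully clear.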
+bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS); + return true; + } + else { + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } + } + mi_bitmap_forall_set_chunks_end(); + return false; +} + + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. +bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8); + mi_assert_internal((*pidx % 8) == 0); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } + } + mi_bitmap_forall_set_chunks_end(); + return false; +} + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { + // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger + // TODO: allow spanning across chunk boundaries + if (n == 0 || n > MI_BFIELD_BITS) return false; + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } + } + mi_bitmap_forall_set_chunks_end(); + return false; +} diff --git a/src/xbitmap.h b/src/xbitmap.h new file mode 100644 index 00000000..869db2a2 --- /dev/null +++ b/src/xbitmap.h @@ -0,0 +1,94 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Concurrent bitmap that can set/reset sequences of bits atomically +---------------------------------------------------------------------------- */ +#pragma once +#ifndef MI_XBITMAP_H +#define MI_XBITMAP_H + +/* -------------------------------------------------------------------------------- + Definitions +-------------------------------------------------------------------------------- */ + +typedef size_t mi_bfield_t; + +#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) +#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) +#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) +#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) +#define MI_BFIELD_LO_BIT8 ((~(mi_bfield_t(0)))/0xFF) // 0x01010101 .. +#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. + +#define MI_BITMAP_CHUNK_BITS_SHIFT (8) // 2^8 = 256 bits per chunk +#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) +#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) +#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) + +typedef mi_decl_align(32) struct mi_bitmap_chunk_s { + _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; +} mi_bitmap_chunk_t; + + +typedef mi_decl_align(32) struct mi_bitmap_s { + mi_bitmap_chunk_t chunks[MI_BFIELD_BITS]; + _Atomic(mi_bfield_t)any_set; +} mi_bitmap_t; + +#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit + +/* -------------------------------------------------------------------------------- + Bitmap +-------------------------------------------------------------------------------- */ + +typedef bool mi_bit_t; +#define MI_BIT_SET (true) +#define MI_BIT_CLEAR (false) + +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset); + +// Is a sequence of n bits already all set/cleared? +bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +// and false otherwise leaving the bitmask as is. +mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); + +// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise leaving the bitmask as is. +mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); + +// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
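+// For scale (assuming a 64-bit build): MI_BFIELD_BITS is 64, so a chunk consists of 4
+// bfields (256 bits) and a bitmap of 64 chunks covers MI_BITMAP_MAX_BITS = 16384 bits
+// in roughly 2KiB of memory. With the 64KiB MI_ARENA_BLOCK_SIZE used by xarena.c in this
+// commit, a single bitmap can therefore track 16384 * 64KiB = 1GiB of arena memory.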
+mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); + +#endif // MI_XBITMAP_H diff --git a/test/main-override-static.c b/test/main-override-static.c index 4ad76d6a..a8e30f69 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -7,6 +7,8 @@ #include #include // redefines malloc etc. +static void mi_bins(void); + static void double_free1(); static void double_free2(); static void corrupt_free(); @@ -33,7 +35,7 @@ int main() { // corrupt_free(); // block_overflow1(); // block_overflow2(); - test_canary_leak(); + // test_canary_leak(); // test_aslr(); // invalid_free(); // test_reserved(); @@ -41,6 +43,9 @@ int main() { // test_heap_walk(); // alloc_huge(); + mi_bins(); + + void* p1 = malloc(78); void* p2 = malloc(24); free(p1); @@ -73,7 +78,7 @@ int main() { static void invalid_free() { free((void*)0xBADBEEF); - realloc((void*)0xBADBEEF,10); + realloc((void*)0xBADBEEF, 10); } static void block_overflow1() { @@ -171,7 +176,7 @@ static void test_process_info(void) { size_t peak_commit = 0; size_t page_faults = 0; for (int i = 0; i < 100000; i++) { - void* p = calloc(100,10); + void* p = calloc(100, 10); free(p); } mi_process_info(&elapsed, &user_msecs, &system_msecs, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); @@ -229,8 +234,8 @@ static void test_heap_walk(void) { } static void test_canary_leak(void) { - char* p = mi_mallocn_tp(char,23); - for(int i = 0; i < 23; i++) { + char* p = mi_mallocn_tp(char, 23); + for (int i = 0; i < 23; i++) { p[i] = '0'+i; } puts(p); @@ -248,15 +253,15 @@ static void test_canary_leak(void) { static void test_large_pages(void) { mi_memid_t memid; - #if 0 +#if 0 size_t pages_reserved; size_t page_size; uint8_t* p = (uint8_t*)_mi_os_alloc_huge_os_pages(1, -1, 30000, &pages_reserved, &page_size, &memid); const size_t req_size = pages_reserved * page_size; - #else +#else const size_t req_size = 64*MI_MiB; - uint8_t* p = (uint8_t*)_mi_os_alloc(req_size,&memid,NULL); - #endif + uint8_t* p = (uint8_t*)_mi_os_alloc(req_size, &memid, NULL); +#endif p[0] = 1; @@ -276,63 +281,16 @@ static void test_large_pages(void) { // bin size experiments // ------------------------------ -#if 0 +#if 1 #include #include +#include -#define MI_INTPTR_SIZE 8 #define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE) #define MI_BIN_HUGE 100 //#define MI_ALIGN2W -// Bit scan reverse: return the index of the highest bit. 
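+// (The local 32-bit `mi_bsr32` helpers below are removed; the experiment now uses the
+// shared bit-scan helpers `mi_bsr`/`mi_clz`, presumably from the new mimalloc/bits.h.
+// As a worked example of the resulting bin mapping in `mi_bin` further below: for
+// wsize = 100 we get b = 6 (the highest bit of 99) and
+//   bin = ((6 << 2) + ((99 >> 4) & 0x03)) - 3 = 23,
+// the bin that groups wsizes 97..112, i.e. each power-of-two range is split into 4 bins.)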
-static inline uint8_t mi_bsr32(uint32_t x); - -#if defined(_MSC_VER) -#include -#include -static inline uint8_t mi_bsr32(uint32_t x) { - uint32_t idx; - _BitScanReverse((DWORD*)&idx, x); - return idx; -} -#elif defined(__GNUC__) || defined(__clang__) -static inline uint8_t mi_bsr32(uint32_t x) { - return (31 - __builtin_clz(x)); -} -#else -static inline uint8_t mi_bsr32(uint32_t x) { - // de Bruijn multiplication, see - static const uint8_t debruijn[32] = { - 31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12, - 30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13, - }; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - x++; - return debruijn[(x*0x076be629) >> 27]; -} -#endif - -/* -// Bit scan reverse: return the index of the highest bit. -uint8_t _mi_bsr(uintptr_t x) { - if (x == 0) return 0; - #if MI_INTPTR_SIZE==8 - uint32_t hi = (x >> 32); - return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi)); - #elif MI_INTPTR_SIZE==4 - return mi_bsr32(x); - #else - # error "define bsr for non-32 or 64-bit platforms" - #endif -} -*/ - static inline size_t _mi_wsize_from_size(size_t size) { return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); @@ -370,7 +328,9 @@ extern inline uint8_t _mi_bin8(size_t size) { #endif wsize--; // find the highest bit - uint8_t b = mi_bsr32((uint32_t)wsize); + size_t idx; + mi_bsr(wsize, &idx); + uint8_t b = (uint8_t)idx; // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin @@ -402,44 +362,79 @@ static inline uint8_t _mi_bin4(size_t size) { bin = MI_BIN_HUGE; } else { - uint8_t b = mi_bsr32((uint32_t)wsize); + size_t idx; + mi_bsr(wsize, &idx); + uint8_t b = (uint8_t)idx; bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3; } return bin; } -static size_t _mi_binx4(size_t bsize) { - if (bsize==0) return 0; - uint8_t b = mi_bsr32((uint32_t)bsize); - if (b <= 1) return bsize; - size_t bin = ((b << 1) | (bsize >> (b - 1))&0x01); +static size_t _mi_binx4(size_t wsize) { + size_t bin; + if (wsize <= 1) { + bin = 1; + } + else if (wsize <= 8) { + // bin = (wsize+1)&~1; // round to double word sizes + bin = (uint8_t)wsize; + } + else { + size_t idx; + mi_bsr(wsize, &idx); + uint8_t b = (uint8_t)idx; + if (b <= 1) return wsize; + bin = ((b << 1) | (wsize >> (b - 1))&0x01) + 3; + } return bin; } static size_t _mi_binx8(size_t bsize) { if (bsize<=1) return bsize; - uint8_t b = mi_bsr32((uint32_t)bsize); + size_t idx; + mi_bsr(bsize, &idx); + uint8_t b = (uint8_t)idx; if (b <= 2) return bsize; size_t bin = ((b << 2) | (bsize >> (b - 2))&0x03) - 5; return bin; } + +static inline size_t mi_bin(size_t wsize) { + uint8_t bin; + if (wsize <= 1) { + bin = 1; + } + else if (wsize <= 8) { + // bin = (wsize+1)&~1; // round to double word sizes + bin = (uint8_t)wsize; + } + else { + wsize--; + assert(wsize>0); + // find the highest bit + uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); + + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). 
+ // - adjust with 3 because we use do not round the first 8 sizes + // which each get an exact bin + bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3; + } + return bin; +} + + static void mi_bins(void) { //printf(" QNULL(1), /* 0 */ \\\n "); size_t last_bin = 0; - size_t min_bsize = 0; - size_t last_bsize = 0; - for (size_t bsize = 1; bsize < 2*1024; bsize++) { - size_t size = bsize * 64 * 1024; - size_t bin = _mi_binx8(bsize); + for (size_t wsize = 1; wsize <= (4*1024*1024) / 8 + 1024; wsize++) { + size_t bin = mi_bin(wsize); if (bin != last_bin) { - printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_bsize, last_bsize, last_bin); - //printf("QNULL(%6zd), ", wsize); - //if (last_bin%8 == 0) printf("/* %i */ \\\n ", last_bin); + //printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_wsize, last_wsize, last_bin); + printf("QNULL(%6zd), ", wsize-1); + if (last_bin%8 == 0) printf("/* %zu */ \\\n ", last_bin); last_bin = bin; - min_bsize = bsize; } - last_bsize = bsize; } } #endif From 441d4fed9fd302bb2a2b326bc8b134c8a15982bb Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 10:40:18 -0800 Subject: [PATCH 002/264] wip: further progress on removing segments --- CMakeLists.txt | 1 + ide/vs2022/mimalloc.vcxproj | 12 +- ide/vs2022/mimalloc.vcxproj.filters | 13 +- include/mimalloc/bits.h | 6 + include/mimalloc/internal.h | 183 +++--- include/mimalloc/types.h | 271 +++----- src/alloc.c | 2 +- src/{xarena.c => arena-old.c} | 875 ++------------------------ src/arena.c | 871 ++++++++++++++++++++++++-- src/bitmap-old.c | 419 +++++++++++++ src/bitmap-old.h | 110 ++++ src/bitmap.c | 940 +++++++++++++++++----------- src/bitmap.h | 154 ++--- src/free.c | 118 ++-- src/heap.c | 5 +- src/os.c | 55 +- src/page-map.c | 90 +++ src/page.c | 67 +- src/static.c | 3 +- src/xbitmap.c | 599 ------------------ src/xbitmap.h | 94 --- 21 files changed, 2396 insertions(+), 2492 deletions(-) rename src/{xarena.c => arena-old.c} (53%) create mode 100644 src/bitmap-old.c create mode 100644 src/bitmap-old.h create mode 100644 src/page-map.c delete mode 100644 src/xbitmap.c delete mode 100644 src/xbitmap.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fc1808e..5cb05840 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ set(mi_sources src/options.c src/os.c src/page.c + src/page-map.c src/random.c src/segment.c src/segment-map.c diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 138acf39..3dd7326f 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -214,12 +214,7 @@ - - true - true - true - true - + false @@ -232,6 +227,7 @@ + true @@ -248,12 +244,8 @@ - - - - diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 48958be1..2eed7e90 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -43,12 +43,6 @@ Sources - - Sources - - - Sources - Sources @@ -58,13 +52,10 @@ Sources - + Sources - - Sources - - + Sources diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 642f0f9c..ad7ea3e6 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -100,6 +100,10 @@ typedef int32_t mi_ssize_t; #define __BMI1__ 1 #endif +// Define big endian if needed +// #define MI_BIG_ENDIAN 1 + + /* -------------------------------------------------------------------------------- Builtin's -------------------------------------------------------------------------------- */ @@ -310,4 +314,6 @@ static inline size_t mi_rotl(size_t x, size_t r) { 
#endif } + + #endif // MI_BITS_H diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index b997099e..2713c0ac 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -108,6 +108,7 @@ size_t _mi_os_page_size(void); size_t _mi_os_good_alloc_size(size_t size); bool _mi_os_has_overcommit(void); bool _mi_os_has_virtual_reserve(void); +size_t _mi_os_virtual_address_bits(void); bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats); bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); @@ -136,12 +137,11 @@ bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); -void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); +/* typedef struct mi_arena_field_cursor_s { // abstract struct size_t os_list_count; // max entries to visit in the OS abandoned list size_t start; // start arena idx (may need to be wrapped) @@ -154,27 +154,12 @@ typedef struct mi_arena_field_cursor_s { // abstract struct void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current); mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current); +*/ -// "segment-map.c" -void _mi_segment_map_allocated_at(const mi_segment_t* segment); -void _mi_segment_map_freed_at(const mi_segment_t* segment); +// "page-map.c" +void _mi_page_map_register(mi_page_t* page); +void _mi_page_map_unregister(mi_page_t* page); -// "segment.c" -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); -void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); -void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); - -#if MI_HUGE_PAGE_ABANDON -void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); -#else -void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); -#endif - -void _mi_segments_collect(bool force, mi_segments_tld_t* tld); -void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); -bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); -bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -226,7 +211,7 @@ void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, siz void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); -void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration +// void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* 
page, const mi_block_t* block, const size_t min_size); // "libc.c" @@ -338,8 +323,8 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { // Align a pointer upwards -static inline void* mi_align_up_ptr(void* p, size_t alignment) { - return (void*)_mi_align_up((uintptr_t)p, alignment); +static inline uint8_t* _mi_align_up_ptr(void* p, size_t alignment) { + return (uint8_t*)_mi_align_up((uintptr_t)p, alignment); } @@ -445,68 +430,44 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si return heap->pages_free_direct[idx]; } -// Segment that contains the pointer -// Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE), -// and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it; -// therefore we align one byte before `p`. -// We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`. -static inline mi_segment_t* _mi_ptr_segment(const void* p) { - mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); - #if MI_INTPTR_SIZE <= 4 - return (p==NULL ? NULL : segment); - #else - return ((intptr_t)segment <= 0 ? NULL : segment); + +extern signed char* _mi_page_map; + +#define MI_PAGE_PTR_INVALID ((mi_page_t*)(1)) + +static inline mi_page_t* _mi_ptr_page(const void* p) { + const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_BLOCK_SHIFT; + const ptrdiff_t ofs = _mi_page_map[up]; + #if MI_DEBUG + if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; #endif + return (mi_page_t*)((up + ofs - 1) << MI_ARENA_BLOCK_SHIFT); } -// Segment belonging to a page -static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { - mi_assert_internal(page!=NULL); - mi_segment_t* segment = _mi_ptr_segment(page); - mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]); - return segment; -} -// used internally -static inline size_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) { - // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages - ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment; - mi_assert_internal(diff >= 0 && (size_t)diff <= MI_SEGMENT_SIZE /* for huge alignment it can be equal */); - size_t idx = (size_t)diff >> segment->page_shift; - mi_assert_internal(idx < segment->capacity); - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0); - return idx; -} - -// Get the page containing the pointer -static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { - size_t idx = _mi_segment_page_idx_of(segment, p); - return &((mi_segment_t*)segment)->pages[idx]; -} - -// Quick page start for initialized pages -static inline uint8_t* mi_page_start(const mi_page_t* page) { - mi_assert_internal(page->page_start != NULL); - mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start); - return page->page_start; -} - -// Get the page containing the pointer -static inline mi_page_t* _mi_ptr_page(void* p) { - mi_assert_internal(p!=NULL); - return _mi_segment_page_of(_mi_ptr_segment(p), p); -} - -// Get the block size of a page (special case for huge objects) +// Get the block size of a page static inline size_t mi_page_block_size(const mi_page_t* page) { mi_assert_internal(page->block_size > 0); return page->block_size; } -static inline bool mi_page_is_huge(const mi_page_t* page) { - mi_assert_internal((page->is_huge && _mi_page_segment(page)->page_kind == MI_PAGE_HUGE) || - (!page->is_huge 
&& _mi_page_segment(page)->page_kind != MI_PAGE_HUGE)); - return page->is_huge; +// Page start +static inline uint8_t* mi_page_start(const mi_page_t* page) { + mi_assert(sizeof(mi_page_t) <= MI_PAGE_INFO_SIZE); + return (uint8_t*)page + MI_PAGE_INFO_SIZE; +} + +static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { + if (size) { *size = mi_page_block_size(page) * page->reserved; } + return mi_page_start(page); +} + +static inline bool mi_page_is_in_arena(const mi_page_t* page) { + return (page->memid.memkind == MI_MEM_ARENA); +} + +static inline bool mi_page_is_singleton(const mi_page_t* page) { + return (page->reserved == 1); } // Get the usable block size of a page without fixed padding. @@ -515,11 +476,6 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) { return mi_page_block_size(page) - MI_PADDING_SIZE; } -// size of a segment -static inline size_t mi_segment_size(mi_segment_t* segment) { - return segment->segment_size; -} - // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3); @@ -534,10 +490,20 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap)); } +static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { + return mi_atomic_load_relaxed(&page->xthread_id); +} + static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); mi_atomic_store_release(&page->xheap,(uintptr_t)heap); - if (heap != NULL) { page->heap_tag = heap->tag; } + if (heap != NULL) { + page->heap_tag = heap->tag; + mi_atomic_store_release(&page->xthread_id, heap->thread_id); + } + else { + mi_atomic_store_release(&page->xthread_id,0); + } } // Thread free flag helpers @@ -576,6 +542,21 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) { return (page->free != NULL); } + +// is the page not yet used up to its reserved space? +static inline bool mi_page_is_expandable(const mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(page->capacity <= page->reserved); + return (page->capacity < page->reserved); +} + + +static inline bool mi_page_is_full(mi_page_t* page) { + bool full = (page->reserved == page->used); + mi_assert_internal(!full || page->free == NULL); + return full; +} + // is more than 7/8th of a page in use? static inline bool mi_page_mostly_used(const mi_page_t* page) { if (page==NULL) return true; @@ -583,6 +564,15 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { return (page->reserved - page->used <= frac); } +static inline bool mi_page_is_abandoned(mi_page_t* page) { + return (mi_page_thread_id(page) == 0); +} + +static inline bool mi_page_is_huge(mi_page_t* page) { + return (page->block_size > MI_LARGE_MAX_OBJ_SIZE); +} + + static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; } @@ -667,17 +657,8 @@ We also pass a separate `null` value to be used as `NULL` or otherwise `(k2<<= 655360) -#error "mimalloc internal: define more bins" -#endif - -// Maximum block size for which blocks are guaranteed to be block size aligned. 
(see `segment.c:_mi_segment_page_start`) -#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX) - -// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments -#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) +// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated orphan pages +#define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_BLOCK_ALIGN) // We never allocate more than PTRDIFF_MAX (see also ) -#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX +#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX + + +// --------------------------------------------------------------- +// a memory id tracks the provenance of arena/OS allocated memory +// --------------------------------------------------------------- + +// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. +typedef enum mi_memkind_e { + MI_MEM_NONE, // not allocated + MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) + MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) + MI_MEM_OS, // allocated from the OS + MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) + MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) + MI_MEM_ARENA // allocated from an arena (the usual case) +} mi_memkind_t; + +static inline bool mi_memkind_is_os(mi_memkind_t memkind) { + return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); +} + +typedef struct mi_memid_os_info { + void* base; // actual base address of the block (used for offset aligned allocations) + size_t alignment; // alignment at allocation +} mi_memid_os_info_t; + +typedef struct mi_memid_arena_info { + size_t block_index; // index in the arena + mi_arena_id_t id; // arena id (>= 1) + bool is_exclusive; // this arena can only be used for specific arena allocations +} mi_memid_arena_info_t; + +typedef struct mi_memid_s { + union { + mi_memid_os_info_t os; // only used for MI_MEM_OS + mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA + } mem; + bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) + bool initially_committed;// `true` if the memory was originally allocated as committed + bool initially_zero; // `true` if the memory was originally zero initialized + mi_memkind_t memkind; +} mi_memid_t; + // ------------------------------------------------------ // Mimalloc pages contain allocated blocks @@ -223,6 +248,10 @@ typedef union mi_page_flags_s { // We use the bottom 2 bits of the pointer for mi_delayed_t flags typedef uintptr_t mi_thread_free_t; +// Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) +typedef struct mi_subproc_s mi_subproc_t; + + // A page contains blocks of one specific size (`block_size`). 
// Each page has three list of free blocks: // `free` for blocks that can be allocated, @@ -242,8 +271,6 @@ typedef uintptr_t mi_thread_free_t; // Notes: // - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - The size is 10 words on 64-bit which helps the page index calculations -// (and 12 words on 32-bit, and encoded free lists add 2 words) // - `xthread_free` uses the bottom bits as a delayed-free flags to optimize // concurrent frees where only the first concurrent free adds to the owning // heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). @@ -252,15 +279,8 @@ typedef uintptr_t mi_thread_free_t; // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { - // "owned" by the segment - uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]` - uint8_t segment_in_use:1; // `true` if the segment allocated this page - uint8_t is_committed:1; // `true` if the page virtual memory is committed - uint8_t is_zero_init:1; // `true` if the page was initially zero initialized - uint8_t is_huge:1; // `true` if the page is in a huge segment - - // layout like this to optimize access in `mi_malloc` and `mi_free` - uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` + mi_memid_t memid; // provenance of the page memory + uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) uint16_t reserved; // number of blocks reserved in memory mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized @@ -272,120 +292,54 @@ typedef struct mi_page_s { uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding - size_t block_size; // size available in each block (always `>0`) - uint8_t* page_start; // start of the page area containing the blocks + size_t block_size; // size available in each block (always `>0`) #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - _Atomic(uintptr_t) xheap; + _Atomic(uintptr_t) xheap; // heap this threads belong to. + _Atomic(mi_threadid_t)xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - - #if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit - void* padding[1]; - #endif } mi_page_t; +// ------------------------------------------------------ +// Object sizes +// ------------------------------------------------------ + +#define MI_PAGE_ALIGN (64) +#define MI_PAGE_INFO_SIZE (MI_SIZE_SHIFT*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) + +// The max object size are checked to not waste more than 12.5% internally over the page sizes. 
+// (Except for large pages since huge objects are allocated in 4MiB chunks) +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~16KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // ~2MiB +#define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) + + +#if (MI_LARGE_MAX_OBJ_WSIZE >= 655360) +#error "mimalloc internal: define more bins" +#endif + // ------------------------------------------------------ -// Mimalloc segments contain mimalloc pages +// Page kinds // ------------------------------------------------------ typedef enum mi_page_kind_e { - MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment - MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment - MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment - MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned) + MI_PAGE_SMALL, // small blocks go into 64KiB pages + MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages + MI_PAGE_LARGE, // larger blocks go into 4MiB pages + MI_PAGE_SINGLETON // page containing a single block. // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. } mi_page_kind_t; -// --------------------------------------------------------------- -// a memory id tracks the provenance of arena/OS allocated memory -// --------------------------------------------------------------- - -// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. -typedef enum mi_memkind_e { - MI_MEM_NONE, // not allocated - MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) - MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) - MI_MEM_OS, // allocated from the OS - MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) - MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) - MI_MEM_ARENA // allocated from an arena (the usual case) -} mi_memkind_t; - -static inline bool mi_memkind_is_os(mi_memkind_t memkind) { - return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); -} - -typedef struct mi_memid_os_info { - void* base; // actual base address of the block (used for offset aligned allocations) - size_t alignment; // alignment at allocation -} mi_memid_os_info_t; - -typedef struct mi_memid_arena_info { - size_t block_index; // index in the arena - mi_arena_id_t id; // arena id (>= 1) - bool is_exclusive; // this arena can only be used for specific arena allocations -} mi_memid_arena_info_t; - -typedef struct mi_memid_s { - union { - mi_memid_os_info_t os; // only used for MI_MEM_OS - mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA - } mem; - bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. 
when allocated using large (2Mib) or huge (1GiB) OS pages) - bool initially_committed;// `true` if the memory was originally allocated as committed - bool initially_zero; // `true` if the memory was originally zero initialized - mi_memkind_t memkind; -} mi_memid_t; - - -// --------------------------------------------------------------- -// Segments contain mimalloc pages -// --------------------------------------------------------------- -typedef struct mi_subproc_s mi_subproc_t; - -// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS. -// Inside segments we allocated fixed size _pages_ that contain blocks. -typedef struct mi_segment_s { - // constant fields - mi_memid_t memid; // memory id to track provenance - bool allow_decommit; - bool allow_purge; - size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` - mi_subproc_t* subproc; // segment belongs to sub process - - // segment fields - struct mi_segment_s* next; // must be the first (non-constant) segment field -- see `segment.c:segment_init` - struct mi_segment_s* prev; - bool was_reclaimed; // true if it was reclaimed (used to limit reclaim-on-free reclamation) - bool dont_free; // can be temporarily true to ensure the segment is not freed - - size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) - size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it is too long) - - size_t used; // count of pages in use (`used <= capacity`) - size_t capacity; // count of available pages (`#free + used`) - size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages. - uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` - - struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled - struct mi_segment_s* abandoned_os_prev; - - // layout like this to optimize access in `mi_free` - _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment - size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). 
- mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge - mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages -} mi_segment_t; - // ------------------------------------------------------ // Heaps @@ -522,21 +476,18 @@ typedef struct mi_stat_counter_s { } mi_stat_counter_t; typedef struct mi_stats_s { - mi_stat_count_t segments; mi_stat_count_t pages; mi_stat_count_t reserved; mi_stat_count_t committed; mi_stat_count_t reset; mi_stat_count_t purged; mi_stat_count_t page_committed; - mi_stat_count_t segments_abandoned; mi_stat_count_t pages_abandoned; mi_stat_count_t threads; mi_stat_count_t normal; mi_stat_count_t huge; mi_stat_count_t giant; mi_stat_count_t malloc; - mi_stat_count_t segments_cache; mi_stat_counter_t pages_extended; mi_stat_counter_t mmap_calls; mi_stat_counter_t commit_calls; @@ -581,12 +532,12 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); // ------------------------------------------------------ struct mi_subproc_s { - _Atomic(size_t) abandoned_count; // count of abandoned segments for this sub-process - _Atomic(size_t) abandoned_os_list_count; // count of abandoned segments in the os-list - mi_lock_t abandoned_os_lock; // lock for the abandoned os segment list (outside of arena's) (this lock protect list operations) + _Atomic(size_t) abandoned_count; // count of abandoned pages for this sub-process + _Atomic(size_t) abandoned_os_list_count; // count of abandoned pages in the os-list + mi_lock_t abandoned_os_lock; // lock for the abandoned os pages list (outside of arena's) (this lock protect list operations) mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list - mi_segment_t* abandoned_os_list; // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory) - mi_segment_t* abandoned_os_list_tail; // the tail-end of the list + mi_page_t* abandoned_os_list; // doubly-linked list of abandoned pages outside of arena's (in OS allocated memory) + mi_page_t* abandoned_os_list_tail; // the tail-end of the list mi_memid_t memid; // provenance of this memory block }; @@ -597,11 +548,6 @@ struct mi_subproc_s { // Milliseconds as in `int64_t` to avoid overflows typedef int64_t mi_msecs_t; -// Queue of segments -typedef struct mi_segment_queue_s { - mi_segment_t* first; - mi_segment_t* last; -} mi_segment_queue_t; // OS thread local data typedef struct mi_os_tld_s { @@ -609,28 +555,13 @@ typedef struct mi_os_tld_s { mi_stats_t* stats; // points to tld stats } mi_os_tld_t; -// Segments thread local data -typedef struct mi_segments_tld_s { - mi_segment_queue_t small_free; // queue of segments with free small pages - mi_segment_queue_t medium_free; // queue of segments with free medium pages - mi_page_queue_t pages_purge; // queue of freed pages that are delay purged - size_t count; // current number of segments; - size_t peak_count; // peak number of segments - size_t current_size; // current size of all segments - size_t peak_size; // peak size of all segments - size_t reclaim_count;// number of reclaimed (abandoned) segments - mi_subproc_t* subproc; // sub-process this thread belongs to. - mi_stats_t* stats; // points to tld stats - mi_os_tld_t* os; // points to os tld -} mi_segments_tld_t; - // Thread local data struct mi_tld_s { unsigned long long heartbeat; // monotonic heartbeat count bool recurse; // true if deferred was called; used to prevent infinite recursion. 
mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - mi_segments_tld_t segments; // segment tld + mi_subproc_t* subproc; // sub-process this thread belongs to. mi_os_tld_t os; // os tld mi_stats_t stats; // statistics }; diff --git a/src/alloc.c b/src/alloc.c index a093f108..00f6d1a4 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -82,7 +82,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ #if (MI_STAT>0) const size_t bsize = mi_page_usable_block_size(page); - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_increase(heap, normal, bsize); mi_heap_stat_counter_increase(heap, normal_count, 1); #if (MI_STAT>1) diff --git a/src/xarena.c b/src/arena-old.c similarity index 53% rename from src/xarena.c rename to src/arena-old.c index 42943f84..8ca5aaf3 100644 --- a/src/xarena.c +++ b/src/arena-old.c @@ -21,834 +21,46 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo #include "mimalloc.h" #include "mimalloc/internal.h" -#include "xbitmap.h" +#include "mimalloc/atomic.h" +#include "bitmap.h" /* ----------------------------------------------------------- Arena allocation ----------------------------------------------------------- */ -#define MI_ARENA_BLOCK_SIZE (MI_SMALL_PAGE_SIZE) // 64KiB -#define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) // 64KiB -#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) - -#define MI_ARENA_MIN_OBJ_SIZE MI_ARENA_BLOCK_SIZE -#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_CHUNK_BITS * MI_ARENA_BLOCK_SIZE) // for now, cannot cross chunk boundaries - // A memory arena descriptor typedef struct mi_arena_s { mi_arena_id_t id; // arena id; 0 for non-specific mi_memid_t memid; // memid of the memory area - // _Atomic(uint8_t*) start; // the start of the memory area - // size_t meta_size; // size of the arena structure itself (including its bitmaps) - // mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) + _Atomic(uint8_t*)start; // the start of the memory area size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) + size_t meta_size; // size of the arena structure itself (including its bitmaps) + mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) int numa_node; // associated NUMA node bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited - _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - - mi_bitmap_t blocks_free; // is the block free? - mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) - mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free) - mi_bitmap_t blocks_dirty; // is the block potentially non-zero? 
- mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) - // the full queue contains abandoned full pages + _Atomic(size_t)search_idx; // optimization to start the search for free blocks + _Atomic(mi_msecs_t)purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) + mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) + mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) + // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. } mi_arena_t; -#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) + +#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB +#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - -/* ----------------------------------------------------------- - Arena id's - id = arena_index + 1 ------------------------------------------------------------ */ - -size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? 
MI_MAX_ARENAS : id - 1); -} - -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; -} - -mi_arena_id_t _mi_arena_id_none(void) { - return 0; -} - -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); -} - -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { - if (memid.memkind == MI_MEM_ARENA) { - return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); - } - else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); - } -} - -size_t mi_arena_get_count(void) { - return mi_atomic_load_relaxed(&mi_arena_count); -} - -mi_arena_t* mi_arena_from_index(size_t idx) { - mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); -} - - - -/* ----------------------------------------------------------- - Util ------------------------------------------------------------ */ - -// Blocks needed for a given byte size -static size_t mi_block_count_of_size(size_t size) { - return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); -} - -// Byte size of a number of blocks -static size_t mi_size_of_blocks(size_t bcount) { - return (bcount * MI_ARENA_BLOCK_SIZE); -} - -// Size of an arena -static size_t mi_arena_size(mi_arena_t* arena) { - return mi_size_of_blocks(arena->block_count); -} - -static size_t mi_arena_info_blocks(void) { - const size_t os_page_size = _mi_os_page_size(); - const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page - const size_t info_blocks = mi_block_count_of_size(info_size); - return info_blocks; -} - - -// Start of the arena memory area -static uint8_t* mi_arena_start(mi_arena_t* arena) { - return ((uint8_t*)arena); -} - -// Start of a block -void* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { - return (mi_arena_start(arena) + mi_size_of_blocks(block_index)); -} - -// Arena area -void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { - if (size != NULL) *size = 0; - const size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); - if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } - return mi_arena_start(arena); -} - - -// Create an arena memid -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) { - mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); - memid.mem.arena.id = id; - memid.mem.arena.block_index = block_index; - memid.mem.arena.is_exclusive = is_exclusive; - return memid; -} - -// returns if the arena is exclusive -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { - mi_assert_internal(memid.memkind == MI_MEM_ARENA); - *arena_index = mi_arena_id_index(memid.mem.arena.id); - *block_index = memid.mem.arena.block_index; - return memid.mem.arena.is_exclusive; -} - - - -/* ----------------------------------------------------------- - Arena Allocation ------------------------------------------------------------ */ - -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, size_t 
tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - - size_t block_index; - if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL; - - // claimed it! - void* p = mi_arena_block_start(arena, block_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index); - memid->is_pinned = arena->memid.is_pinned; - - // set the dirty bits - if (arena->memid.initially_zero) { - memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL); - } - - // set commit state - if (commit) { - // commit requested, but the range may not be committed as a whole: ensure it is committed now - memid->initially_committed = true; - - bool all_already_committed; - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); - if (!all_already_committed) { - bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { - memid->initially_committed = false; - } - else { - if (commit_zero) { memid->initially_zero = true; } - } - } - } - else { - // no need to commit, but check if already fully committed - memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); - } - - return p; -} - -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, - size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_size_of_blocks(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_arena_from_index(arena_index); - if (arena == NULL) return NULL; - if (!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } - - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} - - -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, 
memid, tld); - if (p != NULL) return p; - } - } - else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - } - return NULL; -} - -// try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) -{ - if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); - if (arena_count > (MI_MAX_ARENAS - 4)) return false; - - // calc reserve - size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); - if (arena_reserve == 0) return false; - - if (!_mi_os_has_virtual_reserve()) { - arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) - } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); - - if (arena_count >= 8 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); - size_t reserve = 0; - if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { - arena_reserve = reserve; - } - } - - // check arena bounds - const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1); - const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE; - if (arena_reserve < min_reserve) { - arena_reserve = min_reserve; - } - else if (arena_reserve > max_reserve) { - arena_reserve = max_reserve; - } - - if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size - - // commit eagerly? - bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } - else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); -} - - -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - size_t tseq = _mi_thread_seq_id(); - *memid = _mi_memid_none(); - - const int numa_node = _mi_os_numa_node(tld); // current numa node - - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
- if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - } - } - - // if we cannot use OS allocation, return NULL - if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { - errno = ENOMEM; - return NULL; - } - - // finally, fall back to the OS - if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); - } - else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); - } -} - -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); -} - - -/* ----------------------------------------------------------- - Arena free ------------------------------------------------------------ */ -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats); -static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); - -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); - mi_assert_internal(committed_size <= size); - if (p==NULL) return; - if (size==0) return; - const bool all_committed = (committed_size == size); - - // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
- mi_track_mem_undefined(p, size); - - if (mi_memkind_is_os(memid.memkind)) { - // was a direct OS allocation, pass through - if (!all_committed && committed_size > 0) { - // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - _mi_os_free(p, size, memid, stats); - } - else if (memid.memkind == MI_MEM_ARENA) { - // allocated in an arena - size_t arena_idx; - size_t block_idx; - mi_arena_memid_indices(memid, &arena_idx, &block_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const size_t blocks = mi_block_count_of_size(size); - - // checks - if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - mi_assert_internal(block_idx < arena->block_count); - mi_assert_internal(block_idx > mi_arena_info_blocks()); - if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) { - _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - - // potentially decommit - if (arena->memid.is_pinned || arena->memid.initially_committed) { - mi_assert_internal(all_committed); - } - else { - if (!all_committed) { - // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); - mi_track_mem_noaccess(p, size); - if (committed_size > 0) { - // if partially committed, adjust the committed stats (is it will be recommitted when re-using) - // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - // note: if not all committed, it may be that the purge will reset/decommit the entire range - // that contains already decommitted parts. Since purge consistently uses reset or decommit that - // works (as we should never reset decommitted parts). - } - // (delay) purge the entire range - mi_arena_schedule_purge(arena, block_idx, blocks, stats); - } - - // and make it available to others again - bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL); - if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); - return; - }; - } - else { - // arena was none, external, or static; nothing to do - mi_assert_internal(memid.memkind < MI_MEM_OS); - } - - // purge expired decommits - mi_arenas_try_purge(false, false, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. 
-static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - size_t new_max_arena = 0; - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - mi_lock_done(&arena->abandoned_visit_lock); - if (mi_memkind_is_os(arena->memid.memkind)) { - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main); - } - } - } - - // try to lower the max arena. - size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); -} - -// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { - mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas -} - -// Is a pointer inside any of our arenas? -bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { - return true; - } - } - return false; -} - - -/* ----------------------------------------------------------- - Add an arena. ------------------------------------------------------------ */ - -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { - mi_assert_internal(arena != NULL); - mi_assert_internal(arena->block_count > 0); - if (arena_id != NULL) { *arena_id = -1; } - - size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); - if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); - return false; - } - _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } - return true; -} - -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept -{ - mi_assert(!is_large || memid.initially_committed && memid.is_pinned); - mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)); - mi_assert(start!=NULL); - if (start==NULL) return false; - if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) { - // todo: use alignment in memid to align to blocksize first? 
- _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start); - return false; - } - - if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } - - const size_t info_blocks = mi_arena_info_blocks(); - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down - if (bcount < info_blocks+1) { - _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB); - return false; - } - if (bcount > MI_BITMAP_MAX_BITS) { - // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) - _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB); - return false; - } - mi_arena_t* arena = (mi_arena_t*)start; - - // commit & zero if needed - bool is_zero = memid.initially_zero; - if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main); - } - if (!is_zero) { - _mi_memzero(arena, mi_size_of_blocks(info_blocks)); - } - - // init - arena->id = _mi_arena_id_none(); - arena->memid = memid; - arena->exclusive = exclusive; - arena->block_count = bcount; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) - arena->is_large = is_large; - arena->purge_expire = 0; - mi_lock_init(&arena->abandoned_visit_lock); - - // init bitmaps - mi_bitmap_init(&arena->blocks_free,true); - mi_bitmap_init(&arena->blocks_committed,true); - mi_bitmap_init(&arena->blocks_dirty,true); - mi_bitmap_init(&arena->blocks_purge,true); - for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { - mi_bitmap_init(&arena->blocks_abandoned[i],true); - } - - // reserve our meta info (and reserve blocks outside the memory area) - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks); - if (memid.initially_committed) { - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count); - } - else { - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL); - } - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL); - - return mi_arena_add(arena, arena_id, &_mi_stats_main); -} - - -bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); - memid.initially_committed = is_committed; - memid.initially_zero = is_zero; - memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block - mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); - if (start == NULL) return ENOMEM; - const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
- if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); - _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); - return ENOMEM; - } - _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); - return 0; -} - - -// Manage a range of regular OS memory -bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { - return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { - return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); -} - - -/* ----------------------------------------------------------- - Debugging ------------------------------------------------------------ */ -static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { - size_t bit_set_count = 0; - for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { - bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); - if (is_set) bit_set_count++; - buf[bit] = (is_set ? 'x' : '.'); - } - return bit_set_count; -} - -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) { - _mi_verbose_message("%s%s:\n", prefix, header); - size_t bit_count = 0; - size_t bit_set_count = 0; - for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { - char buf[MI_BITMAP_CHUNK_BITS + 1]; - mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; - for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { - if (bit_count < block_count) { - bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS); - } - else { - _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS); - } - bit_count += MI_BFIELD_BITS; - } - buf[MI_BITMAP_CHUNK_BITS] = 0; - _mi_verbose_message("%s %s\n", prefix, buf); - } - _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); - return bit_set_count; -} - -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { - MI_UNUSED(show_abandoned); - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); - size_t free_total = 0; - size_t block_total = 0; - //size_t abandoned_total = 0; - size_t purge_total = 0; - for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena == NULL) break; - block_total += arena->block_count; - _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? 
", pinned" : "")); - if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); - } - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed); - // todo: abandoned blocks - if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge); - } - } - if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total); - // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); -} - - -/* ----------------------------------------------------------- - Reserve a huge page arena. ------------------------------------------------------------ */ -// reserve at a specific numa node -int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; - if (pages==0) return 0; - if (numa_node < -1) numa_node = -1; - if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); - size_t hsize = 0; - size_t pages_reserved = 0; - mi_memid_t memid; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); - if (p==NULL || pages_reserved==0) { - _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); - return ENOMEM; - } - _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); - return ENOMEM; - } - return 0; -} - -int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { - return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); -} - -// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) -int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { - if (pages == 0) return 0; - - // pages per numa node - size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); - if (numa_count <= 0) numa_count = 1; - const size_t pages_per = pages / numa_count; - const size_t pages_mod = pages % numa_count; - const size_t timeout_per = (timeout_msecs==0 ? 
0 : (timeout_msecs / numa_count) + 50); - - // reserve evenly among numa nodes - for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { - size_t node_pages = pages_per; // can be 0 - if (numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); - if (err) return err; - if (pages < node_pages) { - pages = 0; - } - else { - pages -= node_pages; - } - } - - return 0; -} - -int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { - MI_UNUSED(max_secs); - _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); - if (pages_reserved != NULL) *pages_reserved = 0; - int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); - if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; - return err; -} - - - -/* ----------------------------------------------------------- - Arena purge ------------------------------------------------------------ */ - -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} - -// reset or decommit in an arena and update the committed/decommit bitmaps -// assumes we own the area (i.e. blocks_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_blocks(blocks); - void* const p = mi_arena_block_start(arena, block_idx); - bool needs_recommit; - if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) { - // all blocks are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); - } - else { - // some blocks are not committed -- this can happen when a partially committed block is freed - // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), - // and also undo the decommit stats (as it was already adjusted) - mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } - } - - // clear the purged blocks - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, blocks, block_idx, NULL); - - // update committed bitmap - if (needs_recommit) { - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); - } -} - - -// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. -// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { - const long delay = mi_arena_purge_delay(); - if (delay < 0) return; // is purging allowed at all? 
- - if (_mi_preloading() || delay == 0) { - // decommit directly - mi_arena_purge(arena, block_idx, blocks, stats); - } - else { - // schedule decommit - _mi_error_message(EFAULT, "purging not yet implemented\n"); - } -} - - -static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { - if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - - const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); - if (max_arena == 0) return; - - _mi_error_message(EFAULT, "purging not yet implemented\n"); - MI_UNUSED(stats); - MI_UNUSED(visit_all); - MI_UNUSED(force); -} - - -#if 0 - #define MI_IN_ARENA_C #include "arena-abandon.c" #undef MI_IN_ARENA_C @@ -904,12 +116,12 @@ static size_t mi_block_count_of_size(size_t size) { return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); } -static size_t mi_size_of_blocks(size_t bcount) { +static size_t mi_arena_block_size(size_t bcount) { return (bcount * MI_ARENA_BLOCK_SIZE); } static size_t mi_arena_size(mi_arena_t* arena) { - return mi_size_of_blocks(arena->block_count); + return mi_arena_block_size(arena->block_count); } static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { @@ -995,7 +207,7 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { } void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { - return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex))); + return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); } @@ -1004,7 +216,7 @@ void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { ----------------------------------------------------------- */ // claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats) +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { @@ -1056,7 +268,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); if (any_uncommitted) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) { memid->initially_committed = false; } else { @@ -1081,7 +293,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no const size_t bcount = mi_block_count_of_size(size); const size_t arena_index = mi_arena_id_index(arena_id); mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_size_of_blocks(bcount)); + mi_assert_internal(size <= mi_arena_block_size(bcount)); // Check arena suitability mi_arena_t* arena = mi_arena_from_index(arena_index); @@ -1227,7 +439,7 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { if (arena_index >= MI_MAX_ARENAS) return NULL; mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + if (size != NULL) { *size = 
mi_arena_block_size(arena->block_count); } return arena->start; } @@ -1247,7 +459,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_assert_internal(arena->blocks_committed != NULL); mi_assert_internal(arena->blocks_purge != NULL); mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_blocks(blocks); + const size_t size = mi_arena_block_size(blocks); void* const p = mi_arena_block_start(arena, bitmap_idx); bool needs_recommit; if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { @@ -1299,25 +511,25 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t // purge a range of blocks // return true if the full range was purged. // assumes we own the area (i.e. blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) { - const size_t endidx = startseqx + bitlen; - size_t bitseqx = startseqx; +static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) { + const size_t endidx = startidx + bitlen; + size_t bitidx = startidx; bool all_purged = false; - while (bitseqx < endidx) { + while (bitidx < endidx) { // count consecutive ones in the purge mask size_t count = 0; - while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) { + while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { count++; } if (count > 0) { // found range to be purged - const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx); + const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); mi_arena_purge(arena, range_idx, count, stats); if (count == bitlen) { all_purged = true; } } - bitseqx += (count+1); // +1 to skip the zero bit (or end) + bitidx += (count+1); // +1 to skip the zero bit (or end) } return all_purged; } @@ -1339,16 +551,16 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi for (size_t i = 0; i < arena->field_count; i++) { size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); if (purge != 0) { - size_t bitseqx = 0; - while (bitseqx < MI_BITMAP_FIELD_BITS) { + size_t bitidx = 0; + while (bitidx < MI_BITMAP_FIELD_BITS) { // find consecutive range of ones in the purge mask size_t bitlen = 0; - while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) { + while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { bitlen++; } // temporarily claim the purge range as "in-use" to be thread-safe with allocation // try to claim the longest range of corresponding in_use bits - const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx); + const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); while( bitlen > 0 ) { if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { break; @@ -1359,15 +571,15 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (bitlen > 0) { // read purge again now that we have the in_use bits purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) { + if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) { full_purge = false; } any_purged = true; // release the claimed `in_use` bits again 
_mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); } - bitseqx += (bitlen+1); // +1 to skip the zero (or end) - } // while bitseqx + bitidx += (bitlen+1); // +1 to skip the zero (or end) + } // while bitidx } // purge != 0 } // if not fully purged, make sure to purge again in the future @@ -1530,7 +742,7 @@ bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { return true; } } @@ -1606,8 +818,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_assert_internal(post >= 0); if (post > 0) { // don't use leftover bits at the end - mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL); + mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); + _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); } return mi_arena_add(arena, arena_id, &_mi_stats_main); @@ -1774,4 +986,3 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv } -#endif \ No newline at end of file diff --git a/src/arena.c b/src/arena.c index 8ca5aaf3..28ad61f1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -21,7 +21,6 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "bitmap.h" @@ -29,38 +28,823 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo Arena allocation ----------------------------------------------------------- */ +#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) + + // A memory arena descriptor typedef struct mi_arena_s { - mi_arena_id_t id; // arena id; 0 for non-specific mi_memid_t memid; // memid of the memory area - _Atomic(uint8_t*)start; // the start of the memory area + mi_arena_id_t id; // arena id; 0 for non-specific + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) - size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) - size_t meta_size; // size of the arena structure itself (including its bitmaps) - mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) int numa_node; // associated NUMA node bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited - _Atomic(size_t)search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t)purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. 
(can be NULL for memory that cannot be (reset) decommitted) - mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) - mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) - // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + + mi_bitmap_t blocks_free; // is the block free? + mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) + mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free) + mi_bitmap_t blocks_dirty; // is the block potentially non-zero? + mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + // the full queue contains abandoned full pages } mi_arena_t; - -#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) +#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 + +/* ----------------------------------------------------------- + Arena id's + id = arena_index + 1 +----------------------------------------------------------- */ + +size_t mi_arena_id_index(mi_arena_id_t id) { + return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); +} + +static mi_arena_id_t mi_arena_id_create(size_t arena_index) { + mi_assert_internal(arena_index < MI_MAX_ARENAS); + return (int)arena_index + 1; +} + +mi_arena_id_t _mi_arena_id_none(void) { + return 0; +} + +static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { + return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || + (arena_id == req_arena_id)); +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + if (memid.memkind == MI_MEM_ARENA) { + return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + } + else { + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + } +} + +size_t mi_arena_get_count(void) { + return mi_atomic_load_relaxed(&mi_arena_count); +} + +mi_arena_t* mi_arena_from_index(size_t idx) { + mi_assert_internal(idx < mi_arena_get_count()); + return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); +} + + + +/* ----------------------------------------------------------- + Util +----------------------------------------------------------- */ + + +// Size of an arena +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_size_of_blocks(arena->block_count); +} + +static size_t mi_arena_info_blocks(void) { + const size_t os_page_size = _mi_os_page_size(); + const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page + const size_t info_blocks = mi_block_count_of_size(info_size); + return info_blocks; +} + + +// Start of the arena memory area +static uint8_t* mi_arena_start(mi_arena_t* arena) { + return ((uint8_t*)arena); +} + +// Start of a block +void* mi_arena_block_start(mi_arena_t* arena, size_t 
block_index) {
+  return (mi_arena_start(arena) + mi_size_of_blocks(block_index));
+}
+
+// Arena area
+void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
+  if (size != NULL) *size = 0;
+  const size_t arena_index = mi_arena_id_index(arena_id);
+  if (arena_index >= MI_MAX_ARENAS) return NULL;
+  mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
+  if (arena == NULL) return NULL;
+  if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); }
+  return mi_arena_start(arena);
+}
+
+
+// Create an arena memid
+static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) {
+  mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA);
+  memid.mem.arena.id = id;
+  memid.mem.arena.block_index = block_index;
+  memid.mem.arena.is_exclusive = is_exclusive;
+  return memid;
+}
+
+// returns whether the arena is exclusive
+bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) {
+  mi_assert_internal(memid.memkind == MI_MEM_ARENA);
+  *arena_index = mi_arena_id_index(memid.mem.arena.id);
+  *block_index = memid.mem.arena.block_index;
+  return memid.mem.arena.is_exclusive;
+}
+
+
+
+/* -----------------------------------------------------------
+  Arena Allocation
+----------------------------------------------------------- */
+
+static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
+                                                    bool commit, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+  MI_UNUSED(arena_index);
+  mi_assert_internal(mi_arena_id_index(arena->id) == arena_index);
+
+  size_t block_index;
+  if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL;
+
+  // claimed it!
+  void* p = mi_arena_block_start(arena, block_index);
+  *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index);
+  memid->is_pinned = arena->memid.is_pinned;
+
+  // set the dirty bits
+  if (arena->memid.initially_zero) {
+    memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL);
+  }
+
+  // set commit state
+  if (commit) {
+    // commit requested, but the range may not be committed as a whole: ensure it is committed now
+    memid->initially_committed = true;
+
+    bool all_already_committed;
+    mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed);
+    if (!all_already_committed) {
+      bool commit_zero = false;
+      if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) {
+        memid->initially_committed = false;
+      }
+      else {
+        if (commit_zero) { memid->initially_zero = true; }
+      }
+    }
+  }
+  else {
+    // no need to commit, but check if already fully committed
+    memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount);
+  }
+
+  return p;
+}
+
+// allocate in a specific arena
+static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node,
+                                      size_t size, size_t alignment,
+                                      bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN);
+  if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL;
+
+  const size_t bcount = mi_block_count_of_size(size);
+  const size_t arena_index = mi_arena_id_index(arena_id);
+  mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
+  mi_assert_internal(size <= mi_size_of_blocks(bcount));
+
+  // Check arena 
suitability
+  mi_arena_t* arena = mi_arena_from_index(arena_index);
+  if (arena == NULL) return NULL;
+  if (!allow_large && arena->is_large) return NULL;
+  if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL;
+  if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity
+    const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node);
+    if (match_numa_node) { if (!numa_suitable) return NULL; }
+    else { if (numa_suitable) return NULL; }
+  }
+
+  // try to allocate
+  void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld);
+  mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment));
+  return p;
+}
+
+
+// try to allocate from any of the arenas (the caller falls back to the OS if this fails)
+static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment,
+                                                 bool commit, bool allow_large,
+                                                 mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN);
+  if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL;
+
+  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  if mi_likely(max_arena == 0) return NULL;
+
+  if (req_arena_id != _mi_arena_id_none()) {
+    // try a specific arena if requested
+    if (mi_arena_id_index(req_arena_id) < max_arena) {
+      void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+      if (p != NULL) return p;
+    }
+  }
+  else {
+    // try numa affine allocation
+    for (size_t i = 0; i < max_arena; i++) {
+      void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+      if (p != NULL) return p;
+    }
+
+    // try from another numa node instead..
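+    // (second pass: this only retries the arenas that were skipped above because they are
+    //  on a different numa node, so every arena is attempted at most once in total)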
+    if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arenas have been tried already
+      for (size_t i = 0; i < max_arena; i++) {
+        void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+        if (p != NULL) return p;
+      }
+    }
+  }
+  return NULL;
+}
+
+// try to reserve a fresh arena space
+static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id)
+{
+  if (_mi_preloading()) return false; // use the OS only while preloading
+  if (req_arena_id != _mi_arena_id_none()) return false;
+
+  const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
+  if (arena_count > (MI_MAX_ARENAS - 4)) return false;
+
+  // calc reserve
+  size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve);
+  if (arena_reserve == 0) return false;
+
+  if (!_mi_os_has_virtual_reserve()) {
+    arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example)
+  }
+  arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE);
+
+  if (arena_count >= 8 && arena_count <= 128) {
+    // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB)
+    const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16);
+    size_t reserve = 0;
+    if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) {
+      arena_reserve = reserve;
+    }
+  }
+
+  // check arena bounds
+  const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1);
+  const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE;
+  if (arena_reserve < min_reserve) {
+    arena_reserve = min_reserve;
+  }
+  else if (arena_reserve > max_reserve) {
+    arena_reserve = max_reserve;
+  }
+
+  if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size
+
+  // commit eagerly?
+  bool arena_commit = false;
+  if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); }
+  else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
+
+  return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0);
+}
+
+
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large,
+                              mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert_internal(memid != NULL && tld != NULL);
+  mi_assert_internal(size > 0);
+  size_t tseq = _mi_thread_seq_id();
+  *memid = _mi_memid_none();
+
+  const int numa_node = _mi_os_numa_node(tld); // current numa node
+
+  // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) or too large
+  if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed?
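+    // order of attempts: existing arenas first, then (if no specific arena was requested)
+    // reserve a fresh arena and retry, and only then fall back to the OS further below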
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + } + } + } + + // if we cannot use OS allocation, return NULL + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { + errno = ENOMEM; + return NULL; + } + + // finally, fall back to the OS + if (align_offset > 0) { + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + } + else { + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + } +} + +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); +} + + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats); +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); + +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(committed_size <= size); + if (p==NULL) return; + if (size==0) return; + const bool all_committed = (committed_size == size); + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
+  mi_track_mem_undefined(p, size);
+
+  if (mi_memkind_is_os(memid.memkind)) {
+    // was a direct OS allocation, pass through
+    if (!all_committed && committed_size > 0) {
+      // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size)
+      _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
+    }
+    _mi_os_free(p, size, memid, stats);
+  }
+  else if (memid.memkind == MI_MEM_ARENA) {
+    // allocated in an arena
+    size_t arena_idx;
+    size_t block_idx;
+    mi_arena_memid_indices(memid, &arena_idx, &block_idx);
+    mi_assert_internal(arena_idx < MI_MAX_ARENAS);
+    mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]);
+    mi_assert_internal(arena != NULL);
+    const size_t blocks = mi_block_count_of_size(size);
+
+    // checks
+    if (arena == NULL) {
+      _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+    mi_assert_internal(block_idx < arena->block_count);
+    mi_assert_internal(block_idx > mi_arena_info_blocks());
+    if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) {
+      _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+
+    // potentially decommit
+    if (arena->memid.is_pinned || arena->memid.initially_committed) {
+      mi_assert_internal(all_committed);
+    }
+    else {
+      if (!all_committed) {
+        // mark the entire range as no longer committed (so we recommit the full range when re-using)
+        mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, block_idx, blocks, NULL);
+        mi_track_mem_noaccess(p, size);
+        if (committed_size > 0) {
+          // if partially committed, adjust the committed stats (as it will be recommitted when re-using)
+          // the delayed purge should then not count a decommit if the range is no longer marked as committed.
+          _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
+        }
+        // note: if not all committed, it may be that the purge will reset/decommit the entire range
+        // that contains already decommitted parts. Since purge consistently uses reset or decommit that
+        // works (as we should never reset decommitted parts).
+      }
+      // (delay) purge the entire range
+      mi_arena_schedule_purge(arena, block_idx, blocks, stats);
+    }
+
+    // and make it available to others again
+    bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL);
+    if (!all_inuse) {
+      _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size);
+      return;
+    }
+  }
+  else {
+    // arena was none, external, or static; nothing to do
+    mi_assert_internal(memid.memkind < MI_MEM_OS);
+  }
+
+  // purge expired decommits
+  mi_arenas_try_purge(false, false, stats);
+}
+
+// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
+// for dynamic libraries that are unloaded and need to release all their allocated memory.
+static void mi_arenas_unsafe_destroy(void) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + size_t new_max_arena = 0; + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); + if (mi_memkind_is_os(arena->memid.memkind)) { + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main); + } + } + } + + // try to lower the max arena. + size_t expected = max_arena; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); +} + +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { + mi_arenas_unsafe_destroy(); + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + return true; + } + } + return false; +} + + +/* ----------------------------------------------------------- + Add an arena. +----------------------------------------------------------- */ + +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { + mi_assert_internal(arena != NULL); + mi_assert_internal(arena->block_count > 0); + if (arena_id != NULL) { *arena_id = -1; } + + size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + if (i >= MI_MAX_ARENAS) { + mi_atomic_decrement_acq_rel(&mi_arena_count); + return false; + } + _mi_stat_counter_increase(&stats->arena_count,1); + arena->id = mi_arena_id_create(i); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena->id; } + return true; +} + +static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +{ + mi_assert(!is_large || memid.initially_committed && memid.is_pinned); + mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)); + mi_assert(start!=NULL); + if (start==NULL) return false; + if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) { + // todo: use alignment in memid to align to blocksize first? 
+ _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start); + return false; + } + + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + + const size_t info_blocks = mi_arena_info_blocks(); + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down + if (bcount < info_blocks+1) { + _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB); + return false; + } + if (bcount > MI_BITMAP_MAX_BITS) { + // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) + _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB); + return false; + } + mi_arena_t* arena = (mi_arena_t*)start; + + // commit & zero if needed + bool is_zero = memid.initially_zero; + if (!memid.initially_committed) { + _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main); + } + if (!is_zero) { + _mi_memzero(arena, mi_size_of_blocks(info_blocks)); + } + + // init + arena->id = _mi_arena_id_none(); + arena->memid = memid; + arena->exclusive = exclusive; + arena->block_count = bcount; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->is_large = is_large; + arena->purge_expire = 0; + mi_lock_init(&arena->abandoned_visit_lock); + + // init bitmaps + mi_bitmap_init(&arena->blocks_free,true); + mi_bitmap_init(&arena->blocks_committed,true); + mi_bitmap_init(&arena->blocks_dirty,true); + mi_bitmap_init(&arena->blocks_purge,true); + for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { + mi_bitmap_init(&arena->blocks_abandoned[i],true); + } + + // reserve our meta info (and reserve blocks outside the memory area) + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks); + if (memid.initially_committed) { + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count); + } + else { + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL); + } + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL); + + return mi_arena_add(arena, arena_id, &_mi_stats_main); +} + + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + mi_memid_t memid; + void* start = _mi_os_alloc_aligned(size, MI_ARENA_BLOCK_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + if (start == NULL) return ENOMEM; + const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
+ if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); + return ENOMEM; + } + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + return 0; +} + + +// Manage a range of regular OS memory +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { + return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { + return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); +} + + +/* ----------------------------------------------------------- + Debugging +----------------------------------------------------------- */ +static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { + size_t bit_set_count = 0; + for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { + bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); + if (is_set) bit_set_count++; + buf[bit] = (is_set ? 'x' : '.'); + } + return bit_set_count; +} + +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bit_count = 0; + size_t bit_set_count = 0; + for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { + char buf[MI_BITMAP_CHUNK_BITS + 1]; + mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; + for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + if (bit_count < block_count) { + bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS); + } + else { + _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS); + } + bit_count += MI_BFIELD_BITS; + } + buf[MI_BITMAP_CHUNK_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); + } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); + return bit_set_count; +} + +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { + MI_UNUSED(show_abandoned); + size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t free_total = 0; + size_t block_total = 0; + //size_t abandoned_total = 0; + size_t purge_total = 0; + for (size_t i = 0; i < max_arenas; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; + block_total += arena->block_count; + _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? 
", pinned" : "")); + if (show_inuse) { + free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); + } + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed); + // todo: abandoned blocks + if (show_purge) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge); + } + } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total); + // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); +} + + +/* ----------------------------------------------------------- + Reserve a huge page arena. +----------------------------------------------------------- */ +// reserve at a specific numa node +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = -1; + if (pages==0) return 0; + if (numa_node < -1) numa_node = -1; + if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); + size_t hsize = 0; + size_t pages_reserved = 0; + mi_memid_t memid; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); + if (p==NULL || pages_reserved==0) { + _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); + return ENOMEM; + } + _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); + + if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + _mi_os_free(p, hsize, memid, &_mi_stats_main); + return ENOMEM; + } + return 0; +} + +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { + return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); +} + +// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { + if (pages == 0) return 0; + + // pages per numa node + size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) numa_count = 1; + const size_t pages_per = pages / numa_count; + const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs==0 ? 
0 : (timeout_msecs / numa_count) + 50);
+
+  // reserve evenly among numa nodes
+  for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
+    size_t node_pages = pages_per;  // can be 0
+    if (numa_node < pages_mod) node_pages++;
+    int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
+    if (err) return err;
+    if (pages < node_pages) {
+      pages = 0;
+    }
+    else {
+      pages -= node_pages;
+    }
+  }
+
+  return 0;
+}
+
+int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
+  MI_UNUSED(max_secs);
+  _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
+  if (pages_reserved != NULL) *pages_reserved = 0;
+  int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));
+  if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
+  return err;
+}
+
+
+
+/* -----------------------------------------------------------
+  Abandoned pages
+----------------------------------------------------------- */
+
+void mi_arena_page_abandon(mi_page_t* page) {
+  mi_assert_internal(mi_page_is_abandoned(page));
+  if (mi_page_is_full(page)) {}
+}
+
+
+
+/* -----------------------------------------------------------
+  Arena purge
+----------------------------------------------------------- */
+
+static long mi_arena_purge_delay(void) {
+  // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay
+  return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult));
+}
+
+// reset or decommit in an arena and update the committed/decommit bitmaps
+// assumes we own the area (i.e. blocks_free is claimed by us)
+static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) {
+  mi_assert_internal(!arena->memid.is_pinned);
+  const size_t size = mi_size_of_blocks(blocks);
+  void* const p = mi_arena_block_start(arena, block_idx);
+  bool needs_recommit;
+  if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) {
+    // all blocks are committed, we can purge freely
+    needs_recommit = _mi_os_purge(p, size, stats);
+  }
+  else {
+    // some blocks are not committed -- this can happen when a partially committed block is freed
+    // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
+    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory),
+    // and also undo the decommit stats (as it was already adjusted)
+    mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
+    needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats);
+    if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); }
+  }
+
+  // clear the purged blocks
+  mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_idx, blocks, NULL);
+
+  // update committed bitmap
+  if (needs_recommit) {
+    mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, block_idx, blocks, NULL);
+  }
+}
+
+
+// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls.
+// Note: assumes we (still) own the area as we may purge immediately
+static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) {
+  const long delay = mi_arena_purge_delay();
+  if (delay < 0) return;  // is purging allowed at all?
+ + if (_mi_preloading() || delay == 0) { + // decommit directly + mi_arena_purge(arena, block_idx, blocks, stats); + } + else { + // schedule decommit + _mi_error_message(EFAULT, "purging not yet implemented\n"); + } +} + + +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { + if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + if (max_arena == 0) return; + + _mi_error_message(EFAULT, "purging not yet implemented\n"); + MI_UNUSED(stats); + MI_UNUSED(visit_all); + MI_UNUSED(force); +} + + +#if 0 + #define MI_IN_ARENA_C #include "arena-abandon.c" #undef MI_IN_ARENA_C @@ -116,12 +900,12 @@ static size_t mi_block_count_of_size(size_t size) { return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); } -static size_t mi_arena_block_size(size_t bcount) { +static size_t mi_size_of_blocks(size_t bcount) { return (bcount * MI_ARENA_BLOCK_SIZE); } static size_t mi_arena_size(mi_arena_t* arena) { - return mi_arena_block_size(arena->block_count); + return mi_size_of_blocks(arena->block_count); } static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { @@ -207,7 +991,7 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { } void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { - return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); + return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex))); } @@ -216,7 +1000,7 @@ void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { ----------------------------------------------------------- */ // claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats) { size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { @@ -268,7 +1052,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); if (any_uncommitted) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) { + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { memid->initially_committed = false; } else { @@ -293,7 +1077,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no const size_t bcount = mi_block_count_of_size(size); const size_t arena_index = mi_arena_id_index(arena_id); mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_arena_block_size(bcount)); + mi_assert_internal(size <= mi_size_of_blocks(bcount)); // Check arena suitability mi_arena_t* arena = mi_arena_from_index(arena_index); @@ -439,7 +1223,7 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { if (arena_index >= MI_MAX_ARENAS) return NULL; mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_arena_block_size(arena->block_count); } + if (size != NULL) { *size = 
mi_size_of_blocks(arena->block_count); } return arena->start; } @@ -459,7 +1243,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_assert_internal(arena->blocks_committed != NULL); mi_assert_internal(arena->blocks_purge != NULL); mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_arena_block_size(blocks); + const size_t size = mi_size_of_blocks(blocks); void* const p = mi_arena_block_start(arena, bitmap_idx); bool needs_recommit; if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { @@ -511,25 +1295,25 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t // purge a range of blocks // return true if the full range was purged. // assumes we own the area (i.e. blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) { - const size_t endidx = startidx + bitlen; - size_t bitidx = startidx; +static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) { + const size_t endidx = startseqx + bitlen; + size_t bitseqx = startseqx; bool all_purged = false; - while (bitidx < endidx) { + while (bitseqx < endidx) { // count consecutive ones in the purge mask size_t count = 0; - while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { + while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) { count++; } if (count > 0) { // found range to be purged - const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); + const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx); mi_arena_purge(arena, range_idx, count, stats); if (count == bitlen) { all_purged = true; } } - bitidx += (count+1); // +1 to skip the zero bit (or end) + bitseqx += (count+1); // +1 to skip the zero bit (or end) } return all_purged; } @@ -551,16 +1335,16 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi for (size_t i = 0; i < arena->field_count; i++) { size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); if (purge != 0) { - size_t bitidx = 0; - while (bitidx < MI_BITMAP_FIELD_BITS) { + size_t bitseqx = 0; + while (bitseqx < MI_BITMAP_FIELD_BITS) { // find consecutive range of ones in the purge mask size_t bitlen = 0; - while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { + while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) { bitlen++; } // temporarily claim the purge range as "in-use" to be thread-safe with allocation // try to claim the longest range of corresponding in_use bits - const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); + const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx); while( bitlen > 0 ) { if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { break; @@ -571,15 +1355,15 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (bitlen > 0) { // read purge again now that we have the in_use bits purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) { + if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) { full_purge = false; } any_purged = true; // release the claimed `in_use` bits again 
_mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); } - bitidx += (bitlen+1); // +1 to skip the zero (or end) - } // while bitidx + bitseqx += (bitlen+1); // +1 to skip the zero (or end) + } // while bitseqx } // purge != 0 } // if not fully purged, make sure to purge again in the future @@ -742,7 +1526,7 @@ bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { return true; } } @@ -818,8 +1602,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_assert_internal(post >= 0); if (post > 0) { // don't use leftover bits at the end - mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); + mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); + _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL); } return mi_arena_add(arena, arena_id, &_mi_stats_main); @@ -986,3 +1770,4 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv } +#endif \ No newline at end of file diff --git a/src/bitmap-old.c b/src/bitmap-old.c new file mode 100644 index 00000000..3e6311dc --- /dev/null +++ b/src/bitmap-old.c @@ -0,0 +1,419 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Concurrent bitmap that can set/reset sequences of bits atomically, +represented as an array of fields where each field is a machine word (`size_t`) + +There are two api's; the standard one cannot have sequences that cross +between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). + +The `_across` postfixed functions do allow sequences that can cross over +between the fields. (This is used in arena allocation) +---------------------------------------------------------------------------- */ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/bits.h" +#include "bitmap.h" + +/* ----------------------------------------------------------- + Bitmap definition +----------------------------------------------------------- */ + +// The bit mask for a given number of blocks at a specified bit index. 
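// For example (illustrative): with `count == 3` and `bitidx == 5` the mask below is
// ((1<<3)-1) << 5 == 0xE0 (bits 5..7 set), and a full field (`count == MI_BITMAP_FIELD_BITS`)
// yields MI_BITMAP_FIELD_FULL.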
+static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) { + mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); + mi_assert_internal(count > 0); + if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; + if (count == 0) return 0; + return ((((size_t)1 << count) - 1) << bitidx); +} + + + +/* ----------------------------------------------------------- + Claim a bit sequence atomically +----------------------------------------------------------- */ + +// Try to atomically claim a sequence of `count` bits in a single +// field at `idx` in `bitmap`. Returns `true` on success. +bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) +{ + mi_assert_internal(bitmap_idx != NULL); + mi_assert_internal(count <= MI_BITMAP_FIELD_BITS); + mi_bitmap_field_t* field = &bitmap[idx]; + size_t map = mi_atomic_load_relaxed(field); + if (map==MI_BITMAP_FIELD_FULL) return false; // short cut + + // search for 0-bit sequence of length count + const size_t mask = mi_bitmap_mask_(count, 0); + const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; + +#if MI_HAS_FAST_BITSCAN + size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible +#else + size_t bitidx = 0; // otherwise start at 0 +#endif + size_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx + + // scan linearly for a free range of zero bits + while (bitidx <= bitidx_max) { + const size_t mapm = (map & m); + if (mapm == 0) { // are the mask bits free at bitidx? + mi_assert_internal((m >> bitidx) == mask); // no overflow? + const size_t newmap = (map | m); + mi_assert_internal((newmap^map) >> bitidx == mask); + if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here? + // no success, another thread claimed concurrently.. keep going (with updated `map`) + continue; + } + else { + // success, we claimed the bits! + *bitmap_idx = mi_bitmap_index_create(idx, bitidx); + return true; + } + } + else { + // on to the next bit range +#if MI_HAS_FAST_BITSCAN + mi_assert_internal(mapm != 0); + const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); + mi_assert_internal(shift > 0 && shift <= count); +#else + const size_t shift = 1; +#endif + bitidx += shift; + m <<= shift; + } + } + // no bits found + return false; +} + + +// Starts at idx, and wraps around to search in all `bitmap_fields` fields. +// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. +bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { + size_t idx = start_field_idx; + for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { + if (idx >= bitmap_fields) { idx = 0; } // wrap + if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { + return true; + } + } + return false; +} + + +// Set `count` bits at `bitmap_idx` to 0 atomically +// Returns `true` if all `count` bits were 1 previously. 
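// For illustration, a sketch (editorial example, not part of this change) of how
// `_mi_bitmap_unclaim` below pairs with the claim functions above; `example_fields`
// is a hypothetical two-field bitmap used only here.
static void mi_bitmap_old_claim_example(void) {
  static mi_bitmap_field_t example_fields[2];   // zero-initialized: every bit starts out free
  mi_bitmap_index_t idx;
  if (_mi_bitmap_try_find_from_claim(example_fields, 2, 0, 4, &idx)) {
    // the 4 bits starting at `idx` are now exclusively ours ...
    _mi_bitmap_unclaim(example_fields, 2, 4, idx);   // ... and are released again
  }
}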
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + // mi_assert_internal((bitmap[idx] & mask) == mask); + const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); + return ((prev & mask) == mask); +} + + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. +bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); + size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); + if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); } + return ((prev & mask) == 0); +} + +// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one. +static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + const size_t field = mi_atomic_load_relaxed(&bitmap[idx]); + if (any_ones != NULL) { *any_ones = ((field & mask) != 0); } + return ((field & mask) == mask); +} + +// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. +// Returns `true` if successful when all previous `count` bits were 0. +bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); + do { + if ((expected & mask) != 0) return false; + } + while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); + mi_assert_internal((expected & mask) == 0); + return true; +} + + +bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL); +} + +bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + bool any_ones; + mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); + return any_ones; +} + + +//-------------------------------------------------------------------------- +// the `_across` functions work on bitmaps where sequences can cross over +// between the fields. 
This is used in arena allocation +//-------------------------------------------------------------------------- + +// Try to atomically claim a sequence of `count` bits starting from the field +// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. +// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) +static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) +{ + mi_assert_internal(bitmap_idx != NULL); + + // check initial trailing zeros + mi_bitmap_field_t* field = &bitmap[idx]; + size_t map = mi_atomic_load_relaxed(field); + const size_t initial = mi_clz(map); // count of initial zeros starting at idx + mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS); + if (initial == 0) return false; + if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) + if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries + + // scan ahead + size_t found = initial; + size_t mask = 0; // mask bits for the final field + while(found < count) { + field++; + map = mi_atomic_load_relaxed(field); + const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found)); + mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS); + mask = mi_bitmap_mask_(mask_bits, 0); + if ((map & mask) != 0) return false; // some part is already claimed + found += mask_bits; + } + mi_assert_internal(field < &bitmap[bitmap_fields]); + + // we found a range of contiguous zeros up to the final field; mask contains mask in the final field + // now try to claim the range atomically + mi_bitmap_field_t* const final_field = field; + const size_t final_mask = mask; + mi_bitmap_field_t* const initial_field = &bitmap[idx]; + const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial; + const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx); + + // initial field + size_t newmap; + field = initial_field; + map = mi_atomic_load_relaxed(field); + do { + newmap = (map | initial_mask); + if ((map & initial_mask) != 0) { goto rollback; }; + } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); + + // intermediate fields + while (++field < final_field) { + newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); + map = 0; + if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } + } + + // final field + mi_assert_internal(field == final_field); + map = mi_atomic_load_relaxed(field); + do { + newmap = (map | final_mask); + if ((map & final_mask) != 0) { goto rollback; } + } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); + + // claimed! 
+ mi_stat_counter_increase(stats->arena_crossover_count,1); + *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); + return true; + +rollback: + // roll back intermediate fields + // (we just failed to claim `field` so decrement first) + while (--field > initial_field) { + newmap = 0; + map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); + mi_assert_internal(mi_atomic_load_relaxed(field) == map); + mi_atomic_store_release(field, newmap); + } + if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`) + map = mi_atomic_load_relaxed(field); + do { + mi_assert_internal((map & initial_mask) == initial_mask); + newmap = (map & ~initial_mask); + } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); + } + mi_stat_counter_increase(stats->arena_rollback_count,1); + // retry? (we make a recursive call instead of goto to be able to use const declarations) + if (retries <= 2) { + return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); + } + else { + return false; + } +} + + +// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. +// Starts at idx, and wraps around to search in all `bitmap_fields` fields. +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { + mi_assert_internal(count > 0); + if (count <= 2) { + // we don't bother with crossover fields for small counts + return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx); + } + + // visit the fields + size_t idx = start_field_idx; + for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { + if (idx >= bitmap_fields) { idx = 0; } // wrap + // first try to claim inside a field + /* + if (count <= MI_BITMAP_FIELD_BITS) { + if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { + return true; + } + } + */ + // if that fails, then try to claim across fields + if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { + return true; + } + } + return false; +} + +// Helper for masks across fields; returns the mid count, post_mask may be 0 +static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) { + MI_UNUSED(bitmap_fields); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) { + *pre_mask = mi_bitmap_mask_(count, bitidx); + *mid_mask = 0; + *post_mask = 0; + mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields); + return 0; + } + else { + const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx; + mi_assert_internal(pre_bits < count); + *pre_mask = mi_bitmap_mask_(pre_bits, bitidx); + count -= pre_bits; + const size_t mid_count = (count / MI_BITMAP_FIELD_BITS); + *mid_mask = MI_BITMAP_FIELD_FULL; + count %= MI_BITMAP_FIELD_BITS; + *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0)); + mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields); + return mid_count; + } +} + +// Set `count` bits at `bitmap_idx` to 0 atomically +// Returns `true` if all `count` bits were 1 previously. 
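// For example (illustrative): with 64-bit fields, a range of `count == 100` bits that starts
// at bit 30 of its first field is decomposed by `mi_bitmap_mask_across` above into a 34-bit
// pre mask, one full mid field, and a 2-bit post mask (34 + 64 + 2 == 100).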
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + size_t idx = mi_bitmap_index_field(bitmap_idx); + size_t pre_mask; + size_t mid_mask; + size_t post_mask; + size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); + bool all_one = true; + mi_bitmap_field_t* field = &bitmap[idx]; + size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part + if ((prev & pre_mask) != pre_mask) all_one = false; + while(mid_count-- > 0) { + prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part + if ((prev & mid_mask) != mid_mask) all_one = false; + } + if (post_mask!=0) { + prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part + if ((prev & post_mask) != post_mask) all_one = false; + } + return all_one; +} + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. +bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) { + size_t idx = mi_bitmap_index_field(bitmap_idx); + size_t pre_mask; + size_t mid_mask; + size_t post_mask; + size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); + bool all_zero = true; + bool any_zero = false; + _Atomic(size_t)*field = &bitmap[idx]; + size_t prev = mi_atomic_or_acq_rel(field++, pre_mask); + if ((prev & pre_mask) != 0) all_zero = false; + if ((prev & pre_mask) != pre_mask) any_zero = true; + while (mid_count-- > 0) { + prev = mi_atomic_or_acq_rel(field++, mid_mask); + if ((prev & mid_mask) != 0) all_zero = false; + if ((prev & mid_mask) != mid_mask) any_zero = true; + } + if (post_mask!=0) { + prev = mi_atomic_or_acq_rel(field, post_mask); + if ((prev & post_mask) != 0) all_zero = false; + if ((prev & post_mask) != post_mask) any_zero = true; + } + if (pany_zero != NULL) { *pany_zero = any_zero; } + return all_zero; +} + + +// Returns `true` if all `count` bits were 1. +// `any_ones` is `true` if there was at least one bit set to one. 
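// For example (illustrative): if only 3 of the 4 bits in a range are set, the function below
// returns false (not all ones) while reporting `*pany_ones == true`; the wrappers
// `_mi_bitmap_is_claimed_across` and `_mi_bitmap_is_any_claimed_across` expose exactly these two results.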
+static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { + size_t idx = mi_bitmap_index_field(bitmap_idx); + size_t pre_mask; + size_t mid_mask; + size_t post_mask; + size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); + bool all_ones = true; + bool any_ones = false; + mi_bitmap_field_t* field = &bitmap[idx]; + size_t prev = mi_atomic_load_relaxed(field++); + if ((prev & pre_mask) != pre_mask) all_ones = false; + if ((prev & pre_mask) != 0) any_ones = true; + while (mid_count-- > 0) { + prev = mi_atomic_load_relaxed(field++); + if ((prev & mid_mask) != mid_mask) all_ones = false; + if ((prev & mid_mask) != 0) any_ones = true; + } + if (post_mask!=0) { + prev = mi_atomic_load_relaxed(field); + if ((prev & post_mask) != post_mask) all_ones = false; + if ((prev & post_mask) != 0) any_ones = true; + } + if (pany_ones != NULL) { *pany_ones = any_ones; } + return all_ones; +} + +bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL); +} + +bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + bool any_ones; + mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); + return any_ones; +} diff --git a/src/bitmap-old.h b/src/bitmap-old.h new file mode 100644 index 00000000..f8898935 --- /dev/null +++ b/src/bitmap-old.h @@ -0,0 +1,110 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Concurrent bitmap that can set/reset sequences of bits atomically, +represented as an array of fields where each field is a machine word (`size_t`) + +There are two api's; the standard one cannot have sequences that cross +between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). +(this is used in region allocation) + +The `_across` postfixed functions do allow sequences that can cross over +between the fields. (This is used in arena allocation) +---------------------------------------------------------------------------- */ +#pragma once +#ifndef MI_BITMAP_H +#define MI_BITMAP_H + +/* ----------------------------------------------------------- + Bitmap definition +----------------------------------------------------------- */ + +#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE) +#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set + +// An atomic bitmap of `size_t` fields +typedef _Atomic(size_t) mi_bitmap_field_t; +typedef mi_bitmap_field_t* mi_bitmap_t; + +// A bitmap index is the index of the bit in a bitmap. +typedef size_t mi_bitmap_index_t; + +// Create a bit index. 
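// For example (illustrative): with 64-bit fields, mi_bitmap_index_create(2, 5) == 2*64 + 5 == 133;
// mi_bitmap_index_field(133) == 2, mi_bitmap_index_bit_in_field(133) == 5, and
// mi_bitmap_index_bit(133) == 133.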
+static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) { + mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS); + return (idx*MI_BITMAP_FIELD_BITS) + bitidx; +} +static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { + mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); + return mi_bitmap_index_create_ex(idx,bitidx); +} + +// Get the field index from a bit index. +static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { + return (bitmap_idx / MI_BITMAP_FIELD_BITS); +} + +// Get the bit index in a bitmap field +static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) { + return (bitmap_idx % MI_BITMAP_FIELD_BITS); +} + +// Get the full bit index +static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { + return bitmap_idx; +} + +/* ----------------------------------------------------------- + Claim a bit sequence atomically +----------------------------------------------------------- */ + +// Try to atomically claim a sequence of `count` bits in a single +// field at `idx` in `bitmap`. Returns `true` on success. +bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx); + +// Starts at idx, and wraps around to search in all `bitmap_fields` fields. +// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. +bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); + +// Set `count` bits at `bitmap_idx` to 0 atomically +// Returns `true` if all `count` bits were 1 previously. +bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + +// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. +// Returns `true` if successful when all previous `count` bits were 0. +bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. +bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero); + +bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + + +//-------------------------------------------------------------------------- +// the `_across` functions work on bitmaps where sequences can cross over +// between the fields. This is used in arena allocation +//-------------------------------------------------------------------------- + +// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. +// Starts at idx, and wraps around to search in all `bitmap_fields` fields. +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); + +// Set `count` bits at `bitmap_idx` to 0 atomically +// Returns `true` if all `count` bits were 1 previously. 
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. +bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero); + +bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + +#endif diff --git a/src/bitmap.c b/src/bitmap.c index 3e6311dc..463d74c7 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1,19 +1,12 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ /* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. (This is used in arena allocation) +Concurrent bitmap that can set/reset sequences of bits atomically ---------------------------------------------------------------------------- */ #include "mimalloc.h" @@ -21,399 +14,586 @@ between the fields. (This is used in arena allocation) #include "mimalloc/bits.h" #include "bitmap.h" -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ +/* -------------------------------------------------------------------------------- + bfields +-------------------------------------------------------------------------------- */ -// The bit mask for a given number of blocks at a specified bit index. -static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) { - mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); - mi_assert_internal(count > 0); - if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; - if (count == 0) return 0; - return ((((size_t)1 << count) - 1) << bitidx); +static inline size_t mi_bfield_ctz(mi_bfield_t x) { + return mi_ctz(x); +} + +static inline size_t mi_bfield_clz(mi_bfield_t x) { + return mi_clz(x); +} + +// find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { + return mi_bsf(x,idx); +} + +static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { + return mi_rotr(x,r); +} + +// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). 
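// For intuition, an equivalent portable formulation (an illustrative sketch only; the actual
// implementation below may differ) is an atomic or/and of a single-bit mask:
//
//   const mi_bfield_t mask = ((mi_bfield_t)1) << idx;
//   const mi_bfield_t old  = (set==MI_BIT_SET ? mi_atomic_or_acq_rel(b, mask)
//                                             : mi_atomic_and_acq_rel(b, ~mask));
//   return (set==MI_BIT_SET ? (old & mask) == 0 : (old & mask) != 0);   // did the bit transition?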
+static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = ((mi_bfield_t)1)<> bitidx) == mask); // no overflow? - const size_t newmap = (map | m); - mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here? - // no success, another thread claimed concurrently.. keep going (with updated `map`) - continue; +static bool mi_bitmap_chunk_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx ) { + mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_try_xset( set, &chunk->bfields[i], idx); +} + +static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { + mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); +} + +// Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0) +static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_transition = true; + bool all_already_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); + all_already_xset = all_already_xset && already_xset; + // next field + field++; + idx = 0; + n -= m; + } + *palready_xset = all_already_xset; + return all_transition; +} + +// Check if a sequence of `n` bits within a chunk are all set/cleared. +static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask); + // next field + field++; + idx = 0; + n -= m; + } + return all_xset; +} + +// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving all bit fields as is. 
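// For example (illustrative): in a 256-bit chunk with 64-bit bfields, trying to set the 16 bits
// [56..72) touches two bfields; if the second bfield cannot be claimed, the bits already set in
// the first one are rolled back (see `restore:` below) and false is returned with the chunk unchanged.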
+static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) {
+  mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS);
+  mi_assert_internal(n>0);
+  if (n==0) return true;
+  size_t start_idx = cidx % MI_BFIELD_BITS;
+  size_t start_field = cidx / MI_BFIELD_BITS;
+  size_t end_field = MI_BITMAP_CHUNK_FIELDS;
+  size_t mask_mid = 0;
+  size_t mask_end = 0;
+
+  // first field
+  size_t field = start_field;
+  size_t m = MI_BFIELD_BITS - start_idx;   // m is the bits to xset in this field
+  if (m > n) { m = n; }
+  mi_assert_internal(start_idx + m <= MI_BFIELD_BITS);
+  mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS);
+  const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<<m)-1)<<start_idx);
+  if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start)) return false;
+
+  // done?
+  n -= m;
+  if (n==0) return true;
+
+  // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields
+
+  // mid fields
+  while (n >= MI_BFIELD_BITS) {
+    field++;
+    mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
+    mask_mid = ~MI_ZU(0);
+    if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore;
+    n -= MI_BFIELD_BITS;
+  }
+
+  // last field
+  if (n > 0) {
+    mi_assert_internal(n < MI_BFIELD_BITS);
+    field++;
+    mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
+    end_field = field;
+    mask_end = (MI_ZU(1)<<n)-1;
+    if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end)) goto restore;
+  }
+
+  return true;
+
+restore:
+  // field is on the field that failed to set atomically; we need to restore all previous fields
+  mi_assert_internal(field > start_field);
+  while( field > start_field) {
+    field--;
+    const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid));
+    bool already_xset;
+    mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset);
+  }
+  return false;
+}
+
+
+// find least 1-bit in a chunk and try unset it atomically
+// set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success.
+// todo: try neon version
+static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) {
+  #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
+  while(true) {
+    const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+    if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ?
+    const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ?
-1 : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) + mi_assert_internal(mask != 0); + const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically + *pidx = (i*MI_BFIELD_BITS + idx); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + } + return false; + #endif +} + + +// find least byte in a chunk with all bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while(true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? -1 : 0) + const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte + if (mask == 0) return false; + const size_t i = _tzcnt_u32(mask); + mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS); + const size_t chunk_idx = i / MI_BFIELD_SIZE; + const size_t byte_idx = i % MI_BFIELD_SIZE; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + const mi_bfield_t x = chunk->bfields[i]; + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + const size_t byte_idx = idx/8; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS); + return true; + } + // else continue + } + } + return false; + #endif +} + + +// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. +// todo: try avx2 and neon version +// todo: allow spanning across bfield boundaries? +static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger? 
+ const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1); + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + mi_bfield_t b = chunk->bfields[i]; + size_t bshift = 0; + size_t idx; + while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + b >>= idx; + bshift += idx; + if (bshift + n >= MI_BFIELD_BITS) break; + + if ((b&mask) == mask) { // found a match + mi_assert_internal( ((mask << bshift) >> bshift) == mask ); + if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i] >> bshift); + } } else { - // success, we claimed the bits! - *bitmap_idx = mi_bitmap_index_create(idx, bitidx); - return true; + // advance + const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) + mi_assert_internal(ones>0); + bshift += ones; + b >>= ones; } } + } + return false; +} + + +// are all bits in a bitmap chunk set? +static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_test_all_ones(vec); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x & chunk->bfields[i]; + } + return (~x == 0); + #endif +} + +// are all bits in a bitmap chunk clear? +static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_testz_si256( vec, vec ); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x | chunk->bfields[i]; + } + return (x == 0); + #endif +} + +/* -------------------------------------------------------------------------------- + bitmap +-------------------------------------------------------------------------------- */ +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { + if (!already_zero) { + _mi_memzero_aligned(bitmap, sizeof(*bitmap)); + } +} + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); + + // first chunk + size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + size_t m = MI_BITMAP_CHUNK_BITS - cidx; + if (m > n) { m = n; } + bool already_xset; + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); + + // n can be large so use memset for efficiency for all in-between chunks + chunk_idx++; + n -= m; + const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; + if (mid_chunks > 0) { + _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8); + chunk_idx += mid_chunks; + n -= mid_chunks * MI_BITMAP_CHUNK_BITS; + } + + // last chunk + if (n > 0) { + mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); + } +} + + +// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving the bitmask as is. 
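// For illustration, a sketch (editorial example, not part of this change) that uses
// `mi_bitmap_try_xset` below together with `mi_bitmap_xsetN`; `mi_example_bitmap` is a
// hypothetical bitmap used only here.
static mi_bitmap_t mi_example_bitmap;   // static so it is zero-initialized
static void mi_bitmap_usage_example(void) {
  mi_bitmap_init(&mi_example_bitmap, true /* already zero? */);
  mi_bitmap_xsetN(MI_BIT_SET, &mi_example_bitmap, 0, 8, NULL);     // bits 0..7 become 1
  if (mi_bitmap_try_xset(MI_BIT_CLEAR, &mi_example_bitmap, 3)) {   // atomically claim bit 3 (1 -> 0)
    // ... bit 3 is now exclusively ours ...
    mi_bitmap_xsetN(MI_BIT_SET, &mi_example_bitmap, 3, 1, NULL);   // and release it again
  }
}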
+bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); +} + +// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise leaving the bitmask as is. +bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + mi_assert_internal(idx%8 == 0); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; + return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); } + if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + bool local_already_xset; + if (already_xset==NULL) { already_xset = &local_already_xset; } + // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } + // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); +} + +// Is a sequence of n bits already all set/cleared? 
+bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); +} + + +#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ + { size_t _set_idx; \ + size_t _start = start % MI_BFIELD_BITS; \ + mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ + while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ + decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; + +#define mi_bitmap_forall_set_chunks_end() \ + _start += _set_idx+1; /* so chunk_idx stays valid */ \ + _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ + _any_set >>= 1; \ + } \ + } + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). +bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS); + return true; + } else { - // on to the next bit range -#if MI_HAS_FAST_BITSCAN - mi_assert_internal(mapm != 0); - const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); - mi_assert_internal(shift > 0 && shift <= count); -#else - const size_t shift = 1; -#endif - bitidx += shift; - m <<= shift; - } - } - // no bits found - return false; -} - - -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - return true; - } - } - return false; -} - - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - // mi_assert_internal((bitmap[idx] & mask) == mask); - const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); - return ((prev & mask) == mask); -} - - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. 
-bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); - size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); - if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); } - return ((prev & mask) == 0); -} - -// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one. -static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - const size_t field = mi_atomic_load_relaxed(&bitmap[idx]); - if (any_ones != NULL) { *any_ones = ((field & mask) != 0); } - return ((field & mask) == mask); -} - -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. -bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); - do { - if ((expected & mask) != 0) return false; - } - while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); - mi_assert_internal((expected & mask) == 0); - return true; -} - - -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL); -} - -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; -} - - -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. This is used in arena allocation -//-------------------------------------------------------------------------- - -// Try to atomically claim a sequence of `count` bits starting from the field -// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. 
-// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) -static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) -{ - mi_assert_internal(bitmap_idx != NULL); - - // check initial trailing zeros - mi_bitmap_field_t* field = &bitmap[idx]; - size_t map = mi_atomic_load_relaxed(field); - const size_t initial = mi_clz(map); // count of initial zeros starting at idx - mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS); - if (initial == 0) return false; - if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) - if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries - - // scan ahead - size_t found = initial; - size_t mask = 0; // mask bits for the final field - while(found < count) { - field++; - map = mi_atomic_load_relaxed(field); - const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found)); - mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS); - mask = mi_bitmap_mask_(mask_bits, 0); - if ((map & mask) != 0) return false; // some part is already claimed - found += mask_bits; - } - mi_assert_internal(field < &bitmap[bitmap_fields]); - - // we found a range of contiguous zeros up to the final field; mask contains mask in the final field - // now try to claim the range atomically - mi_bitmap_field_t* const final_field = field; - const size_t final_mask = mask; - mi_bitmap_field_t* const initial_field = &bitmap[idx]; - const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial; - const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx); - - // initial field - size_t newmap; - field = initial_field; - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | initial_mask); - if ((map & initial_mask) != 0) { goto rollback; }; - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - - // intermediate fields - while (++field < final_field) { - newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); - map = 0; - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } - } - - // final field - mi_assert_internal(field == final_field); - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | final_mask); - if ((map & final_mask) != 0) { goto rollback; } - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - - // claimed! - mi_stat_counter_increase(stats->arena_crossover_count,1); - *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); - return true; - -rollback: - // roll back intermediate fields - // (we just failed to claim `field` so decrement first) - while (--field > initial_field) { - newmap = 0; - map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); - mi_assert_internal(mi_atomic_load_relaxed(field) == map); - mi_atomic_store_release(field, newmap); - } - if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`) - map = mi_atomic_load_relaxed(field); - do { - mi_assert_internal((map & initial_mask) == initial_mask); - newmap = (map & ~initial_mask); - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - } - mi_stat_counter_increase(stats->arena_rollback_count,1); - // retry? 
(we make a recursive call instead of goto to be able to use const declarations) - if (retries <= 2) { - return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); - } - else { - return false; - } -} - - -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { - mi_assert_internal(count > 0); - if (count <= 2) { - // we don't bother with crossover fields for small counts - return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx); - } - - // visit the fields - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - // first try to claim inside a field - /* - if (count <= MI_BITMAP_FIELD_BITS) { - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - return true; + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); } } - */ - // if that fails, then try to claim across fields - if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { - return true; - } } + mi_bitmap_forall_set_chunks_end(); return false; } -// Helper for masks across fields; returns the mid count, post_mask may be 0 -static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) { - MI_UNUSED(bitmap_fields); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) { - *pre_mask = mi_bitmap_mask_(count, bitidx); - *mid_mask = 0; - *post_mask = 0; - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields); - return 0; - } - else { - const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx; - mi_assert_internal(pre_bits < count); - *pre_mask = mi_bitmap_mask_(pre_bits, bitidx); - count -= pre_bits; - const size_t mid_count = (count / MI_BITMAP_FIELD_BITS); - *mid_mask = MI_BITMAP_FIELD_FULL; - count %= MI_BITMAP_FIELD_BITS; - *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0)); - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields); - return mid_count; + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. 
+bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8); + mi_assert_internal((*pidx % 8) == 0); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } } + mi_bitmap_forall_set_chunks_end(); + return false; } -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_one = true; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part - if ((prev & pre_mask) != pre_mask) all_one = false; - while(mid_count-- > 0) { - prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part - if ((prev & mid_mask) != mid_mask) all_one = false; +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { + // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger + // TODO: allow spanning across chunk boundaries + if (n == 0 || n > MI_BFIELD_BITS) return false; + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } } - if (post_mask!=0) { - prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part - if ((prev & post_mask) != post_mask) all_one = false; - } - return all_one; -} - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. 
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_zero = true; - bool any_zero = false; - _Atomic(size_t)*field = &bitmap[idx]; - size_t prev = mi_atomic_or_acq_rel(field++, pre_mask); - if ((prev & pre_mask) != 0) all_zero = false; - if ((prev & pre_mask) != pre_mask) any_zero = true; - while (mid_count-- > 0) { - prev = mi_atomic_or_acq_rel(field++, mid_mask); - if ((prev & mid_mask) != 0) all_zero = false; - if ((prev & mid_mask) != mid_mask) any_zero = true; - } - if (post_mask!=0) { - prev = mi_atomic_or_acq_rel(field, post_mask); - if ((prev & post_mask) != 0) all_zero = false; - if ((prev & post_mask) != post_mask) any_zero = true; - } - if (pany_zero != NULL) { *pany_zero = any_zero; } - return all_zero; -} - - -// Returns `true` if all `count` bits were 1. -// `any_ones` is `true` if there was at least one bit set to one. -static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_ones = true; - bool any_ones = false; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_load_relaxed(field++); - if ((prev & pre_mask) != pre_mask) all_ones = false; - if ((prev & pre_mask) != 0) any_ones = true; - while (mid_count-- > 0) { - prev = mi_atomic_load_relaxed(field++); - if ((prev & mid_mask) != mid_mask) all_ones = false; - if ((prev & mid_mask) != 0) any_ones = true; - } - if (post_mask!=0) { - prev = mi_atomic_load_relaxed(field); - if ((prev & post_mask) != post_mask) all_ones = false; - if ((prev & post_mask) != 0) any_ones = true; - } - if (pany_ones != NULL) { *pany_ones = any_ones; } - return all_ones; -} - -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL); -} - -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; + mi_bitmap_forall_set_chunks_end(); + return false; } diff --git a/src/bitmap.h b/src/bitmap.h index f8898935..198a2902 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -6,105 +6,87 @@ terms of the MIT license. A copy of the license can be found in the file -----------------------------------------------------------------------------*/ /* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). -(this is used in region allocation) - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. 
(This is used in arena allocation) +Concurrent bitmap that can set/reset sequences of bits atomically ---------------------------------------------------------------------------- */ #pragma once #ifndef MI_BITMAP_H #define MI_BITMAP_H -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ +/* -------------------------------------------------------------------------------- + Definitions +-------------------------------------------------------------------------------- */ -#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE) -#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set +typedef size_t mi_bfield_t; -// An atomic bitmap of `size_t` fields -typedef _Atomic(size_t) mi_bitmap_field_t; -typedef mi_bitmap_field_t* mi_bitmap_t; +#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) +#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) +#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) +#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) +#define MI_BFIELD_LO_BIT8 ((~(mi_bfield_t(0)))/0xFF) // 0x01010101 .. +#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. -// A bitmap index is the index of the bit in a bitmap. -typedef size_t mi_bitmap_index_t; +#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) +#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) -// Create a bit index. -static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) { - mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS); - return (idx*MI_BITMAP_FIELD_BITS) + bitidx; -} -static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { - mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); - return mi_bitmap_index_create_ex(idx,bitidx); -} - -// Get the field index from a bit index. -static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx / MI_BITMAP_FIELD_BITS); -} - -// Get the bit index in a bitmap field -static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx % MI_BITMAP_FIELD_BITS); -} - -// Get the full bit index -static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { - return bitmap_idx; -} - -/* ----------------------------------------------------------- - Claim a bit sequence atomically ------------------------------------------------------------ */ - -// Try to atomically claim a sequence of `count` bits in a single -// field at `idx` in `bitmap`. Returns `true` on success. -bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx); - -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. 
-bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero); - -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +typedef mi_decl_align(32) struct mi_bitmap_chunk_s { + _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; +} mi_bitmap_chunk_t; -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. This is used in arena allocation -//-------------------------------------------------------------------------- +typedef mi_decl_align(32) struct mi_bitmap_s { + mi_bitmap_chunk_t chunks[MI_BFIELD_BITS]; + _Atomic(mi_bfield_t)any_set; +} mi_bitmap_t; -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); +#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +/* -------------------------------------------------------------------------------- + Bitmap +-------------------------------------------------------------------------------- */ -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero); +typedef bool mi_bit_t; +#define MI_BIT_SET (true) +#define MI_BIT_CLEAR (false) -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); -#endif +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset); + +// Is a sequence of n bits already all set/cleared? 
+bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +// and false otherwise leaving the bitmask as is. +mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); + +// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise leaving the bitmask as is. +mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); + +// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); + +#endif // MI_XBITMAP_H diff --git a/src/free.c b/src/free.c index f2e30b65..e1cc9276 100644 --- a/src/free.c +++ b/src/free.c @@ -24,7 +24,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); // ------------------------------------------------------ // forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block); +static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_block_t* block); // regular free of a (thread local) block pointer // fast path written carefully to prevent spilling on the stack @@ -57,7 +57,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - size_t diff = (uint8_t*)p - page->page_start; + size_t diff = (uint8_t*)p - mi_page_start(page); size_t adjust; if mi_likely(page->block_size_shift != 0) { adjust = diff & (((size_t)1 << page->block_size_shift) - 1); @@ -82,72 +82,55 @@ static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, vo #endif // free a local pointer (page parameter comes first for better codegen) -static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { - MI_UNUSED(segment); +static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(page, p) : (mi_block_t*)p); mi_block_check_unguard(page, block, p); mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */); } // free a pointer owned by another thread (page parameter comes first for better codegen) -static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { +static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_block_check_unguard(page, block, p); - mi_free_block_mt(page, segment, block); + mi_free_block_mt(page, block); } // generic free (for runtime integration) -void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { - if (is_local) mi_free_generic_local(page,segment,p); - else mi_free_generic_mt(page,segment,p); +void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { + if (is_local) mi_free_generic_local(page,p); + else mi_free_generic_mt(page,p); } // Get the segment data belonging to a pointer // This is just a single `and` in release mode but does further checks in debug mode // (and secure mode) to see if this was a valid pointer. -static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) +static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) { - MI_UNUSED(msg); - - #if (MI_DEBUG>0) + MI_UNUSED_RELEASE(msg); + #if MI_DEBUG if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) { _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); return NULL; } #endif - - mi_segment_t* const segment = _mi_ptr_segment(p); - if mi_unlikely(segment==NULL) return segment; - - #if (MI_DEBUG>0) - if mi_unlikely(!mi_is_in_heap_region(p)) { - _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" - "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); - if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { - _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } + mi_page_t* const page = _mi_ptr_page(p); + #if MI_DEBUG + if (page == MI_PAGE_PTR_INVALID) { + _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); } #endif - #if (MI_DEBUG>0 || MI_SECURE>=4) - if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { - _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); - return NULL; - } - #endif - - return segment; + return page; } // Free a block // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { - mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - if mi_unlikely(segment==NULL) return; - - const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_page_t* const page = _mi_segment_page_of(segment, p); + mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); + if mi_unlikely(page==NULL) return; + + const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); if mi_likely(is_local) { // thread-local free? 
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page @@ -156,12 +139,12 @@ void mi_free(void* p) mi_attr_noexcept } else { // page is full or contains (inner) aligned blocks; use generic path - mi_free_generic_local(page, segment, p); + mi_free_generic_local(page, p); } } else { // not thread-local; use generic path - mi_free_generic_mt(page, segment, p); + mi_free_generic_mt(page, p); } } @@ -169,10 +152,8 @@ void mi_free(void* p) mi_attr_noexcept bool _mi_free_delayed_block(mi_block_t* block) { // get segment and page mi_assert_internal(block!=NULL); - const mi_segment_t* const segment = _mi_ptr_segment(block); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(_mi_thread_id() == segment->thread_id); - mi_page_t* const page = _mi_segment_page_of(segment, block); + mi_page_t* const page = mi_checked_ptr_page(block,"_mi_free_delayed_block"); + mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); // Clear the no-delayed flag so delayed freeing is used again for this page. // This must be done before collecting the free lists on this page -- otherwise @@ -242,20 +223,19 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block } // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { - // first see if the segment was abandoned and if we can reclaim it into our thread - if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 && - #if MI_HUGE_PAGE_ABANDON - segment->page_kind != MI_PAGE_HUGE && - #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0 && // segment is abandoned? 
+ // first see if the page was abandoned and if we can reclaim it into our thread + if (mi_page_is_abandoned(page) && + (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || + mi_page_is_singleton(page) // only one block, and we are free-ing it + ) && mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) { - // the segment is abandoned, try to reclaim it into our heap - if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc); + // the page is abandoned, try to reclaim it into our heap + if (_mi_heap_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue + mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); + // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); mi_free(block); // recursively free as now it will be a local free in our heap return; } @@ -272,17 +252,12 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection _mi_padding_shrink(page, block, sizeof(mi_block_t)); - if (segment->page_kind == MI_PAGE_HUGE) { - #if MI_HUGE_PAGE_ABANDON - // huge page segments are always abandoned and can be freed immediately - _mi_segment_huge_page_free(segment, page, block); - return; - #else + if (mi_page_is_huge(page)) { + mi_assert_internal(mi_page_is_singleton(page)); // huge pages are special as they occupy the entire segment // as these are large we reset the memory occupied by the page so it is available to other threads // (as the owning thread needs to actually free the memory later). 
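+    // note: `_mi_os_reset` tells the OS the contents are no longer needed but the
+    // address range itself stays mapped, so the owning thread can still free it later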
- _mi_segment_huge_page_reset(segment, page, block); - #endif + _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively } else { #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading @@ -316,9 +291,8 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p } static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); - if mi_unlikely(segment==NULL) return 0; - const mi_page_t* const page = _mi_segment_page_of(segment, p); + const mi_page_t* const page = mi_checked_ptr_page(p,msg); + if mi_unlikely(page==NULL) return 0; if mi_likely(!mi_page_has_aligned(page)) { const mi_block_t* block = (const mi_block_t*)p; return mi_page_usable_size_of(page, block); @@ -514,20 +488,20 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { // only maintain stats for smaller objects if requested #if (MI_STAT>0) static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { -#if (MI_STAT < 2) + #if (MI_STAT < 2) MI_UNUSED(block); -#endif + #endif mi_heap_t* const heap = mi_heap_get_default(); const size_t bsize = mi_page_usable_block_size(page); -#if (MI_STAT>1) + #if (MI_STAT>1) const size_t usize = mi_page_usable_size_of(page, block); mi_heap_stat_decrease(heap, malloc, usize); -#endif - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + #endif + if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, normal, bsize); -#if (MI_STAT > 1) + #if (MI_STAT > 1) mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); -#endif + #endif } else { const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc diff --git a/src/heap.c b/src/heap.c index 581b3f71..e4955ba7 100644 --- a/src/heap.c +++ b/src/heap.c @@ -7,11 +7,8 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" // mi_prim_get_default_heap -#include // memset, memcpy - #if defined(_MSC_VER) && (_MSC_VER < 1920) #pragma warning(disable:4204) // non-constant aggregate initializer #endif @@ -258,7 +255,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) { mi_assert_internal(heap != NULL); mi_assert_internal(mi_heap_is_initialized(heap)); // TODO: copy full empty heap instead? - memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct)); + _mi_memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct)); _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages)); heap->thread_delayed_free = NULL; heap->page_count = 0; diff --git a/src/os.c b/src/os.c index 36b167cb..83521766 100644 --- a/src/os.c +++ b/src/os.c @@ -59,6 +59,10 @@ size_t _mi_os_large_page_size(void) { return (mi_os_mem_config.large_page_size != 0 ? 
mi_os_mem_config.large_page_size : _mi_os_page_size()); } +size_t _mi_os_virtual_address_bits(void) { + return mi_os_mem_config.virtual_address_bits; +} + bool _mi_os_use_large_page(size_t size, size_t alignment) { // if we have access, check the size and alignment requirements if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false; @@ -103,58 +107,10 @@ static void* mi_align_down_ptr(void* p, size_t alignment) { return (void*)_mi_align_down((uintptr_t)p, alignment); } - -/* ----------------------------------------------------------- - aligned hinting --------------------------------------------------------------- */ - -// On systems with enough virtual address bits, we can do efficient aligned allocation by using -// the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address -// space (64TiB) we use this technique. (but see issue #939) -#if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT) -static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; - -// Return a MI_SEGMENT_SIZE aligned address that is probably available. -// If this returns NULL, the OS will determine the address but on some OS's that may not be -// properly aligned which can be more costly as it needs to be adjusted afterwards. -// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; -// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses -// in the middle of the 2TiB - 6TiB address range (see issue #372)) - -#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start -#define MI_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes) -#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages) - -void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) -{ - if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL; - if (mi_os_mem_config.virtual_address_bits < 46) return NULL; // < 64TiB virtual address space - size = _mi_align_up(size, MI_SEGMENT_SIZE); - if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096. - #if (MI_SECURE>0) - size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas. 
- #endif - - uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size); - if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize - uintptr_t init = MI_HINT_BASE; - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode - uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); - init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB - #endif - uintptr_t expected = hint + size; - mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init); - hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all - } - if (hint%try_alignment != 0) return NULL; - return (void*)hint; -} -#else void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { MI_UNUSED(try_alignment); MI_UNUSED(size); return NULL; } -#endif /* ----------------------------------------------------------- @@ -380,12 +336,10 @@ void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { ----------------------------------------------------------- */ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { - mi_assert(offset <= MI_SEGMENT_SIZE); mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); *memid = _mi_memid_none(); if (stats == NULL) stats = &_mi_stats_main; - if (offset > MI_SEGMENT_SIZE) return NULL; if (offset == 0) { // regular aligned allocation return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats); @@ -605,7 +559,6 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { #endif } end = start + size; - mi_assert_internal(end % MI_SEGMENT_SIZE == 0); } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end)); if (total_size != NULL) *total_size = size; diff --git a/src/page-map.c b/src/page-map.c new file mode 100644 index 00000000..d3fcef79 --- /dev/null +++ b/src/page-map.c @@ -0,0 +1,90 @@ +/*---------------------------------------------------------------------------- +Copyright (c) 2023-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" + +mi_decl_cache_align signed char* _mi_page_map = NULL; +static bool mi_page_map_all_committed = false; +static size_t mi_blocks_per_commit_bit = 1; +static mi_memid_t mi_page_map_memid; +static mi_bitmap_t mi_page_map_commit; + +static bool mi_page_map_init(void) { + size_t vbits = _mi_os_virtual_address_bits(); + if (vbits >= 48) vbits = 47; + // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) + // 64 KiB for 4 GiB address space (on 32-bit) + const size_t page_map_size = (MI_ZU(1) << (vbits >> MI_ARENA_BLOCK_SHIFT)); + + const size_t min_commit_size = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + mi_blocks_per_commit_bit = mi_block_count_of_size(min_commit_size); + + mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems + _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 0, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); + if (_mi_page_map==NULL) { + _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); + return false; + } + if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { + _mi_warning_message("the page map was committed on-demand but not zero initialized!\n"); + _mi_memzero_aligned(_mi_page_map, page_map_size); + } + return true; +} + +static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* block_count) { + size_t page_size; + *page_start = mi_page_area(page, &page_size); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE; } // furthest interior pointer + *block_count = mi_block_count_of_size(page_size); + return ((uintptr_t)*page_start >> MI_ARENA_BLOCK_SHIFT); +} + + + +void _mi_page_map_register(mi_page_t* page) { + if mi_unlikely(_mi_page_map == NULL) { + if (!mi_page_map_init()) return; + } + uint8_t* page_start; + size_t block_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); + + // is the page map area that contains the page address committed? 
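+  // (each bit in `mi_page_map_commit` covers `mi_blocks_per_commit_bit` page-map
+  //  entries; bits are tested and committed one at a time so a test never spans
+  //  bitmap chunks, and a racing commit at worst commits the same range twice)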
+ if (!mi_page_map_all_committed) { + const size_t commit_bit_count = _mi_divide_up(block_count, mi_blocks_per_commit_bit); + const size_t commit_bit_idx = idx / mi_blocks_per_commit_bit; + for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { + // this may race, in which case we do multiple commits (which is ok) + _mi_os_commit(page_start + (i*mi_blocks_per_commit_bit*MI_ARENA_BLOCK_SIZE), mi_blocks_per_commit_bit* MI_ARENA_BLOCK_SIZE, NULL, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); + } + } + } + + // set the offsets + for (int i = 0; i < block_count; i++) { + mi_assert_internal(i < 128); + _mi_page_map[idx + i] = (int8_t)(-i-1); + } +} + + +void _mi_page_map_unregister(mi_page_t* page) { + mi_assert_internal(_mi_page_map != NULL); + + // get index and count + uint8_t* page_start; + size_t block_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); + + // unset the offsets + _mi_memzero(_mi_page_map + idx, block_count); +} diff --git a/src/page.c b/src/page.c index c681d6d0..a00ff615 100644 --- a/src/page.c +++ b/src/page.c @@ -59,7 +59,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) { static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { size_t psize; - uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + uint8_t* page_area = mi_page_area(page, &psize); mi_block_t* start = (mi_block_t*)page_area; mi_block_t* end = (mi_block_t*)(page_area + psize); while(p != NULL) { @@ -83,10 +83,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->capacity <= page->reserved); // const size_t bsize = mi_page_block_size(page); - mi_segment_t* segment = _mi_page_segment(page); uint8_t* start = mi_page_start(page); - mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL)); - mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -122,15 +119,11 @@ bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(page->keys[0] != 0); #endif if (mi_page_heap(page)!=NULL) { - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == mi_page_heap(page)->thread_id || segment->thread_id==0); - #if MI_HUGE_PAGE_ABANDON - if (segment->page_kind != MI_PAGE_HUGE) - #endif + mi_assert_internal(!_mi_process_is_initialized || page->thread_id == mi_page_heap(page)->thread_id || page->thread_id==0); { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); - mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page)); + mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_MAX_OBJ_SIZE || mi_page_is_in_full(page)); mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); } } @@ -274,16 +267,13 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size #if !MI_HUGE_PAGE_ABANDON mi_assert_internal(pq != NULL); mi_assert_internal(mi_heap_contains_queue(heap, pq)); - mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_OBJ_SIZE_MAX || block_size == pq->block_size); + 
mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); #endif - mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments, &heap->tld->os); + mi_page_t* page = _mi_heap_page_alloc(heap, block_size, page_alignment); if (page == NULL) { // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) return NULL; } - #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); - #endif mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); // a fresh page was found, initialize it const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc @@ -384,7 +374,6 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { mi_heap_t* pheap = mi_page_heap(page); // remove from our page list - mi_segments_tld_t* segments_tld = &pheap->tld->segments; mi_page_queue_remove(pq, page); // page is no longer associated with our heap @@ -399,8 +388,8 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { #endif // and abandon it - mi_assert_internal(mi_page_heap(page) == NULL); - _mi_segment_page_abandon(page,segments_tld); + mi_assert_internal(mi_page_is_abandoned(page)); + _mi_arena_page_abandon(page,&pheap->tld); } // force abandon a page @@ -411,8 +400,7 @@ void _mi_page_force_abandon(mi_page_t* page) { // ensure this page is no longer in the heap delayed free list _mi_heap_delayed_free_all(heap); - // We can still access the page meta-info even if it is freed as we ensure - // in `mi_segment_force_abandon` that the segment is not freed (yet) + // TODO: can we still access the page meta-info even if it is freed? if (page->capacity == 0) return; // it may have been freed now // and now unlink it from the page queue and abandon (or free) @@ -433,17 +421,18 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { mi_assert_internal(mi_page_all_free(page)); mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING); + mi_heap_t* pheap = mi_page_heap(page); + // no more aligned blocks in here mi_page_set_has_aligned(page, false); // remove from the page list // (no need to do _mi_heap_delayed_free first as all blocks are already free) - mi_segments_tld_t* segments_tld = &mi_page_heap(page)->tld->segments; mi_page_queue_remove(pq, page); // and free it mi_page_set_heap(page,NULL); - _mi_segment_page_free(page, force, segments_tld); + _mi_arena_page_free(page, force, &pheap->tld); } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE @@ -474,7 +463,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); + page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; @@ -639,7 +628,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) size_t page_size; //uint8_t* page_start = - _mi_segment_page_start(_mi_page_segment(page), page, &page_size); + mi_page_area(page, &page_size); mi_stat_counter_increase(tld->stats.pages_extended, 1); // calculate the extend count @@ -676,15 +665,13 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) // Initialize a fresh page static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) { mi_assert(page != NULL); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert(segment != NULL); mi_assert_internal(block_size > 0); // set fields mi_page_set_heap(page, heap); page->block_size = block_size; size_t page_size; - page->page_start = _mi_segment_page_start(segment, page, &page_size); - mi_track_mem_noaccess(page->page_start,page_size); + uint8_t* page_start = mi_page_area(page, &page_size); + mi_track_mem_noaccess(page_start,page_size); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); mi_assert_internal(page->reserved > 0); @@ -692,15 +679,15 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi page->keys[0] = _mi_heap_random_next(heap); page->keys[1] = _mi_heap_random_next(heap); #endif - page->free_is_zero = page->is_zero_init; + page->free_is_zero = page->memid.initially_zero; #if MI_DEBUG>2 - if (page->is_zero_init) { + if (page->memid.initially_zero) { mi_track_mem_defined(page->page_start, page_size); - mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size)); + mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); } #endif if (block_size > 0 && _mi_is_power_of_two(block_size)) { - page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); + page->block_size_shift = (uint8_t)mi_ctz(block_size); } else { page->block_size_shift = 0; @@ -734,13 +721,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi // search for a best next page to use for at most N pages (often cut short if immediate blocks are available) #define MI_MAX_CANDIDATE_SEARCH (8) -// is the page not yet used up to its reserved space? -static bool mi_page_is_expandable(const mi_page_t* page) { - mi_assert_internal(page != NULL); - mi_assert_internal(page->capacity <= page->reserved); - return (page->capacity < page->reserved); -} - // Find a page with free blocks of `page->block_size`. 
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) @@ -907,7 +887,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a #if MI_HUGE_PAGE_ABANDON mi_page_queue_t* pq = NULL; #else - mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size + mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size mi_assert_internal(mi_page_queue_is_huge(pq)); #endif mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); @@ -915,10 +895,9 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_is_huge(page)); - mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE); - mi_assert_internal(_mi_page_segment(page)->used==1); + mi_assert_internal(mi_page_is_singleton(page)); #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue + mi_assert_internal(mi_page_is_abandoned(page)); mi_page_set_heap(page, NULL); #endif mi_heap_stat_increase(heap, huge, mi_page_block_size(page)); @@ -933,7 +912,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { // huge allocation? const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { + if mi_unlikely(req_size > (MI_LARGE_MAX_OBJ_SIZE - MI_PADDING_SIZE) || huge_alignment > 0) { if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; diff --git a/src/static.c b/src/static.c index 9e06ce05..b34d5d42 100644 --- a/src/static.c +++ b/src/static.c @@ -20,7 +20,7 @@ terms of the MIT license. A copy of the license can be found in the file // containing the whole library. If it is linked first // it will override all the standard library allocation // functions (on Unix's). -#include "alloc.c" // includes alloc-override.c +#include "alloc.c" // includes alloc-override.c and free.c #include "alloc-aligned.c" #include "alloc-posix.c" #include "arena.c" @@ -31,6 +31,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "options.c" #include "os.c" #include "page.c" // includes page-queue.c +#include "page-map.c" #include "random.c" #include "segment.c" #include "segment-map.c" diff --git a/src/xbitmap.c b/src/xbitmap.c deleted file mode 100644 index 68525c84..00000000 --- a/src/xbitmap.c +++ /dev/null @@ -1,599 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024 Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. 
------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically ----------------------------------------------------------------------------- */ - -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/bits.h" -#include "xbitmap.h" - -/* -------------------------------------------------------------------------------- - bfields --------------------------------------------------------------------------------- */ - -static inline size_t mi_bfield_ctz(mi_bfield_t x) { - return mi_ctz(x); -} - -static inline size_t mi_bfield_clz(mi_bfield_t x) { - return mi_clz(x); -} - -// find the least significant bit that is set (i.e. count trailing zero's) -// return false if `x==0` (with `*idx` undefined) and true otherwise, -// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { - return mi_bsf(x,idx); -} - -static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { - return mi_rotr(x,r); -} - -// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). -static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[i], idx); -} - -static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { - mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); -} - -// Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0) -static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(n>0); - bool all_transition = true; - bool all_already_xset = true; - size_t idx = cidx % MI_BFIELD_BITS; - size_t field = cidx / MI_BFIELD_BITS; - while (n > 0) { - size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field - if (m > n) { m = n; } - mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); - all_already_xset = all_already_xset && already_xset; - // next field - field++; - idx = 0; - n -= m; - } - *palready_xset = all_already_xset; - return all_transition; -} - -// Check if a sequence of `n` bits within a chunk are all set/cleared. -static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(n>0); - bool all_xset = true; - size_t idx = cidx % MI_BFIELD_BITS; - size_t field = cidx / MI_BFIELD_BITS; - while (n > 0) { - size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field - if (m > n) { m = n; } - mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask = (m == MI_BFIELD_BITS ? 
~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask); - // next field - field++; - idx = 0; - n -= m; - } - return all_xset; -} - -// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0), -// and false otherwise leaving all bit fields as is. -static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(n>0); - if (n==0) return true; - size_t start_idx = cidx % MI_BFIELD_BITS; - size_t start_field = cidx / MI_BFIELD_BITS; - size_t end_field = MI_BITMAP_CHUNK_FIELDS; - size_t mask_mid = 0; - size_t mask_end = 0; - - // first field - size_t field = start_field; - size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field - if (m > n) { m = n; } - mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); - mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask_start)) return false; - - // done? - n -= m; - if (n==0) return true; - - // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields - - // mid fields - while (n >= MI_BFIELD_BITS) { - field++; - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - mask_mid = ~MI_ZU(0); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore; - n -= MI_BFIELD_BITS; - } - - // last field - if (n > 0) { - mi_assert_internal(n < MI_BFIELD_BITS); - field++; - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - end_field = field; - mask_end = (MI_ZU(1)<bfields[field], mask_end)) goto restore; - } - - return true; - -restore: - // field is on the field that failed to set atomically; we need to restore all previous fields - mi_assert_internal(field > start_field); - while( field > start_field) { - field--; - const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid)); - bool already_xset; - mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset); - } - return false; -} - - -// find least 1-bit in a chunk and try unset it atomically -// set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. -// todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - while(true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ? - const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? 
-1 : 0) - const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) - mi_assert_internal(mask != 0); - const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - size_t cidx; - if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - // try again - } - #else - size_t idx; - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically - *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - } - return false; - #endif -} - - -// find least byte in a chunk with all bits set, and try unset it atomically -// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. -// todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - while(true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? -1 : 0) - const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte - if (mask == 0) return false; - const size_t i = _tzcnt_u32(mask); - mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS); - const size_t chunk_idx = i / MI_BFIELD_SIZE; - const size_t byte_idx = i % MI_BFIELD_SIZE; - if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - // try again - } - #else - size_t idx; - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - const mi_bfield_t x = chunk->bfields[i]; - // has_set8 has low bit in each byte set if the byte in x == 0xFF - const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F - (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 - >> 7; // shift high bit to low bit - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit - mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); - mi_assert_internal((idx%8)==0); - const size_t byte_idx = idx/8; - if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS); - return true; - } - // else continue - } - } - return false; - #endif -} - - -// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically -// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. -// todo: try avx2 and neon version -// todo: allow spanning across bfield boundaries? -static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { - if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger? 
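/* Editor's aside (illustration only, not part of the patch): the scalar path of
   mi_bitmap_chunk_find_and_try_clear8 above detects 0xFF bytes with a bit trick.
   A minimal self-contained version, using my own hypothetical names LO_BIT8,
   HI_BIT8 and full_bytes (the deleted code calls these MI_BFIELD_LO_BIT8 and
   MI_BFIELD_HI_BIT8): */
#include <stdint.h>
#define LO_BIT8 ((~(uint64_t)0)/0xFF)   // 0x0101010101010101
#define HI_BIT8 (LO_BIT8 << 7)          // 0x8080808080808080

// Low bit of each byte of the result is set where the corresponding byte of x
// is 0xFF. A borrow can only originate from a lower byte that is itself 0xFF,
// so the least-significant match (the one the search takes) is always exact.
static inline uint64_t full_bytes(uint64_t x) {
  return ((~x - LO_BIT8) & (x & HI_BIT8)) >> 7;
}
// Example: full_bytes(0x00FF7F80FF000001) == 0x0001000001000000,
// i.e. bytes 3 and 6 (counting from the least-significant byte) are 0xFF;
// 0x80 and 0x7F bytes are not flagged.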
- const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1); - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - mi_bfield_t b = chunk->bfields[i]; - size_t bshift = 0; - size_t idx; - while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - b >>= idx; - bshift += idx; - if (bshift + n >= MI_BFIELD_BITS) break; - - if ((b&mask) == mask) { // found a match - mi_assert_internal( ((mask << bshift) >> bshift) == mask ); - if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i] >> bshift); - } - } - else { - // advance - const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) - mi_assert_internal(ones>0); - bshift += ones; - b >>= ones; - } - } - } - return false; -} - - -// are all bits in a bitmap chunk set? -static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return _mm256_test_all_ones(vec); - #else - // written like this for vectorization - mi_bfield_t x = chunk->bfields[0]; - for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { - x = x & chunk->bfields[i]; - } - return (~x == 0); - #endif -} - -// are all bits in a bitmap chunk clear? -static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return _mm256_testz_si256( vec, vec ); - #else - // written like this for vectorization - mi_bfield_t x = chunk->bfields[0]; - for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { - x = x | chunk->bfields[i]; - } - return (x == 0); - #endif -} - -/* -------------------------------------------------------------------------------- - bitmap --------------------------------------------------------------------------------- */ -// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { - if (!already_zero) { - _mi_memzero_aligned(bitmap, sizeof(*bitmap)); - } -} - -// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. -void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); - - // first chunk - size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - size_t m = MI_BITMAP_CHUNK_BITS - cidx; - if (m > n) { m = n; } - bool already_xset; - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); - - // n can be large so use memset for efficiency for all in-between chunks - chunk_idx++; - n -= m; - const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; - if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8); - chunk_idx += mid_chunks; - n -= mid_chunks * MI_BITMAP_CHUNK_BITS; - } - - // last chunk - if (n > 0) { - mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); - } -} - - -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), -// and false otherwise leaving the bitmask as is. 
-bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < MI_BITMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); -} - -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < MI_BITMAP_MAX_BITS); - mi_assert_internal(idx%8 == 0); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; - return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); -} - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); } - if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); } - - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); -} - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - bool local_already_xset; - if (already_xset==NULL) { already_xset = &local_already_xset; } - // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } - // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } - - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); -} - -// Is a sequence of n bits already all set/cleared? 
-bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); -} - - -#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ - { size_t _set_idx; \ - size_t _start = start % MI_BFIELD_BITS; \ - mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ - while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ - decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; - -#define mi_bitmap_forall_set_chunks_end() \ - _start += _set_idx+1; /* so chunk_idx stays valid */ \ - _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ - _any_set >>= 1; \ - } \ - } - -// Find a set bit in a bitmap and atomically unset it. Returns true on success, -// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. -// The low `MI_BFIELD_BITS` of start are used to set the start point of the search -// (to reduce thread contention). -bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS); - return true; - } - else { - // we may find that all are unset only on a second iteration but that is ok as - // _any_set is a conservative approximation. - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); - } - } - } - mi_bitmap_forall_set_chunks_end(); - return false; -} - - -// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8); - mi_assert_internal((*pidx % 8) == 0); - return true; - } - else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); - } - } - } - mi_bitmap_forall_set_chunks_end(); - return false; -} - -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { - // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger - // TODO: allow spanning across chunk boundaries - if (n == 0 || n > MI_BFIELD_BITS) return false; - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n); - return true; - } - else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); - } - } - } - mi_bitmap_forall_set_chunks_end(); - return false; -} diff --git a/src/xbitmap.h b/src/xbitmap.h deleted file mode 100644 index 869db2a2..00000000 --- a/src/xbitmap.h +++ /dev/null @@ -1,94 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically ----------------------------------------------------------------------------- */ -#pragma once -#ifndef MI_XBITMAP_H -#define MI_XBITMAP_H - -/* -------------------------------------------------------------------------------- - Definitions --------------------------------------------------------------------------------- */ - -typedef size_t mi_bfield_t; - -#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) -#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) -#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) -#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) -#define MI_BFIELD_LO_BIT8 ((~(mi_bfield_t(0)))/0xFF) // 0x01010101 .. -#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. - -#define MI_BITMAP_CHUNK_BITS_SHIFT (8) // 2^8 = 256 bits per chunk -#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) -#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) -#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) - -typedef mi_decl_align(32) struct mi_bitmap_chunk_s { - _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; -} mi_bitmap_chunk_t; - - -typedef mi_decl_align(32) struct mi_bitmap_s { - mi_bitmap_chunk_t chunks[MI_BFIELD_BITS]; - _Atomic(mi_bfield_t)any_set; -} mi_bitmap_t; - -#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit - -/* -------------------------------------------------------------------------------- - Bitmap --------------------------------------------------------------------------------- */ - -typedef bool mi_bit_t; -#define MI_BIT_SET (true) -#define MI_BIT_CLEAR (false) - -// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); - -// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
-void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset); - -// Is a sequence of n bits already all set/cleared? -bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -// and false otherwise leaving the bitmask as is. -mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); - -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); - -// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -// Find a set bit in a bitmap and atomically unset it. Returns true on success, -// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. -// The low `MI_BFIELD_BITS` of start are used to set the start point of the search -// (to reduce thread contention). -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); - -// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); - -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); - -#endif // MI_XBITMAP_H From 46afcbe06cd0000eeda5400fba7eb23453237b8c Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 14:28:34 -0800 Subject: [PATCH 003/264] wip: further progress on segment removal; arena allocation --- include/mimalloc/internal.h | 7 +- include/mimalloc/types.h | 17 +- src/arena-page.c | 20 ++ src/arena.c | 368 ++++++++++++++++++++++++++---------- src/bitmap.c | 16 +- src/bitmap.h | 6 +- src/page-map.c | 8 +- src/page.c | 56 +++--- 8 files changed, 344 insertions(+), 154 deletions(-) create mode 100644 src/arena-page.c diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 2713c0ac..d60b0c15 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -137,6 +137,9 @@ bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); +void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld); +void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld); void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); @@ -181,6 +184,7 @@ void _mi_deferred_free(mi_heap_t* heap, bool force); void _mi_page_free_collect(mi_page_t* page,bool force); void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments +void _mi_page_init(mi_heap_t* heap, mi_page_t* page); size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats @@ -453,8 +457,7 @@ static inline size_t mi_page_block_size(const mi_page_t* page) { // Page start static inline uint8_t* mi_page_start(const mi_page_t* page) { - mi_assert(sizeof(mi_page_t) <= MI_PAGE_INFO_SIZE); - return (uint8_t*)page + MI_PAGE_INFO_SIZE; + return page->page_start; } static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 98664020..591cb603 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -127,8 +127,11 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) #define MI_BITMAP_CHUNK_BITS (MI_ZU(1) << MI_BITMAP_CHUNK_BITS_SHIFT) -#define MI_ARENA_MIN_OBJ_SIZE MI_ARENA_BLOCK_SIZE -#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_CHUNK_BITS * MI_ARENA_BLOCK_SIZE) // for now, cannot cross chunk boundaries +#define MI_ARENA_MIN_OBJ_BLOCKS (1) +#define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries + +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) +#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) #define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) @@ -141,7 +144,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_BIN_COUNT (MI_BIN_FULL+1) -// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated orphan pages +// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in singleton pages #define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_BLOCK_ALIGN) // We never allocate more than PTRDIFF_MAX (see also ) @@ -279,7 +282,6 @@ typedef struct mi_subproc_s mi_subproc_t; // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { - mi_memid_t memid; // provenance of the page memory uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) uint16_t reserved; // number of blocks reserved in memory mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) @@ -293,6 +295,7 @@ typedef struct mi_page_s { uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the blocks #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary @@ -304,6 +307,7 @@ typedef struct mi_page_s { struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -312,7 +316,7 @@ typedef struct mi_page_s { // ------------------------------------------------------ #define MI_PAGE_ALIGN (64) -#define MI_PAGE_INFO_SIZE (MI_SIZE_SHIFT*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) +#define MI_PAGE_INFO_SIZE (2*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) // The max object size are checked to not waste more than 12.5% internally over the page sizes. // (Except for large pages since huge objects are allocated in 4MiB chunks) @@ -532,7 +536,7 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); // ------------------------------------------------------ struct mi_subproc_s { - _Atomic(size_t) abandoned_count; // count of abandoned pages for this sub-process + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // count of abandoned pages for this sub-process _Atomic(size_t) abandoned_os_list_count; // count of abandoned pages in the os-list mi_lock_t abandoned_os_lock; // lock for the abandoned os pages list (outside of arena's) (this lock protect list operations) mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list @@ -562,6 +566,7 @@ struct mi_tld_s { mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) mi_subproc_t* subproc; // sub-process this thread belongs to. + size_t tseq; // thread sequence id mi_os_tld_t os; // os tld mi_stats_t stats; // statistics }; diff --git a/src/arena-page.c b/src/arena-page.c new file mode 100644 index 00000000..93d25dbf --- /dev/null +++ b/src/arena-page.c @@ -0,0 +1,20 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" + + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + diff --git a/src/arena.c b/src/arena.c index 28ad61f1..c9f8400b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,6 +42,7 @@ typedef struct mi_arena_s { bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + mi_subproc_t* subproc; mi_bitmap_t blocks_free; // is the block free? mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) @@ -99,6 +100,9 @@ mi_arena_t* mi_arena_from_index(size_t idx) { return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); } +mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { + return mi_arena_from_index(mi_arena_id_index(id)); +} /* ----------------------------------------------------------- @@ -164,14 +168,11 @@ bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block Arena Allocation ----------------------------------------------------------- */ -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - +static mi_decl_noinline void* mi_arena_try_alloc_at( + mi_arena_t* arena, size_t needed_bcount, bool commit, size_t tseq, mi_memid_t* memid) +{ size_t block_index; - if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL; + if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, needed_bcount, tseq, &block_index)) return NULL; // claimed it! 
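/* Editor's aside (hedged sketch, not part of the patch): mi_block_count_of_size
   and mi_size_of_blocks are used throughout this hunk but defined elsewhere.
   Conceptually they round a byte size up to whole arena blocks and back, where
   MI_ARENA_BLOCK_SIZE is the 64 KiB granularity implied by the "512 KiB"
   medium-page comment earlier in this patch. The helpers below are my own
   names, shown only to make the surrounding arithmetic easier to follow: */
static inline size_t block_count_of_size(size_t size) {   // hypothetical helper
  return (size + MI_ARENA_BLOCK_SIZE - 1) / MI_ARENA_BLOCK_SIZE;  // round up to whole blocks
}
static inline size_t size_of_blocks(size_t bcount) {      // hypothetical helper
  return bcount * MI_ARENA_BLOCK_SIZE;                    // block count back to bytes
}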
void* p = mi_arena_block_start(arena, block_index); @@ -192,7 +193,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); if (!all_already_committed) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, NULL)) { memid->initially_committed = false; } else { @@ -205,75 +206,14 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); } + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, needed_bcount)); + if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount)); } + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount)); + // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, needed_bcount)); + return p; } -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, - size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_size_of_blocks(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_arena_from_index(arena_index); - if (arena == NULL) return NULL; - if (!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } - - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} - - -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, 
commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - } - return NULL; -} // try to reserve a fresh arena space static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) @@ -323,56 +263,286 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) + + +/* ----------------------------------------------------------- + Arena iteration +----------------------------------------------------------- */ + +static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_arena_id, mi_subproc_t* subproc, int numa_node, bool allow_large) { + if (subproc != NULL && arena->subproc != subproc) return false; + if (!allow_large && arena->is_large) return false; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return false; + if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (!numa_suitable) return false; + } + return true; +} + +#define MI_THREADS_PER_ARENA (16) + +#define mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, var_arena_id, var_arena) \ + { \ + size_t _max_arena; \ + size_t _start; \ + if (req_arena_id == _mi_arena_id_none()) { \ + _max_arena = mi_atomic_load_relaxed(&mi_arena_count); \ + _start = (_max_arena <= 1 ? 
0 : (tseq / MI_THREADS_PER_ARENA) % _max_arena); \ + } \ + else { \ + _max_arena = 1; \ + _start = mi_arena_id_index(req_arena_id); \ + mi_assert_internal(mi_atomic_load_relaxed(&mi_arena_count) > _start); \ + } \ + for (size_t i = 0; i < _max_arena; i++) { \ + size_t _idx = i + _start; \ + if (_idx >= _max_arena) { _idx -= _max_arena; } \ + const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); \ + mi_arena_t* const var_arena = mi_arena_from_index(_idx); \ + if (mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ + { + +#define mi_forall_arenas_end() }}} + + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + +// allocate blocks from the arenas +static mi_decl_noinline void* mi_arena_try_find_free( + size_t block_count, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - size_t tseq = _mi_thread_seq_id(); - *memid = _mi_memid_none(); + mi_assert_internal(block_count <= mi_block_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - const int numa_node = _mi_os_numa_node(tld); // current numa node + // search arena's + mi_subproc_t* const subproc = tld->subproc; + const size_t tseq = tld->tseq; + mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) + { + void* p = mi_arena_try_alloc_at(arena, block_count, commit, tseq, memid); + if (p != NULL) return p; + } + mi_forall_arenas_end(); + return NULL; +} - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
- if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); +// Allocate blocks from the arena's -- potentially allocating a fresh arena +static mi_decl_noinline void* mi_arena_try_alloc( + size_t block_count, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +{ + mi_assert(block_count <= MI_ARENA_MAX_OBJ_BLOCKS); + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + + void* p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(mi_size_of_blocks(block_count), allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } } } +} +// Allocate from the OS (if allowed) +static void* mi_arena_os_alloc_aligned( + size_t size, size_t alignment, size_t align_offset, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +{ // if we cannot use OS allocation, return NULL if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { errno = ENOMEM; return NULL; } - // finally, fall back to the OS if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, &tld->stats); } else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, &tld->stats); } } + +// Allocate large sized memory +void* _mi_arena_alloc_aligned( + size_t size, size_t alignment, size_t align_offset, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + + // *memid = _mi_memid_none(); + // const int numa_node = _mi_os_numa_node(&tld->os); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? + req_arena_id == _mi_arena_id_none() && // not a specific arena? 
+ size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large + alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) // and good alignment + { + const size_t block_count = mi_block_count_of_size(size); + void* p = mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + + // fall back to the OS + return mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); +} + void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) { return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } + +/* ----------------------------------------------------------- + Arena page allocation +----------------------------------------------------------- */ + +static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +{ + const size_t bin = _mi_bin(block_size); + mi_assert_internal(bin < MI_BIN_COUNT); + + // any abandoned in our size class? + mi_subproc_t* const subproc = tld->subproc; + if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) return NULL; + + // search arena's + const bool allow_large = true; + size_t tseq = tld->tseq; + mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) + { + size_t block_index; + if (mi_bitmap_try_find_and_clear(&arena->blocks_abandoned[bin], tseq, &block_index)) { + // found an abandoned page of the right size + mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); + mi_page_t* page = (mi_page_t*)mi_arena_block_start(arena, block_index); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, block_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, block_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, block_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, block_count)); + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(!mi_page_is_full(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + return page; + } + } + mi_forall_arenas_end(); + return false; +} + +static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +{ + const bool allow_large = true; + const bool commit = true; + const size_t alignment = MI_ARENA_BLOCK_ALIGN; + + // try to allocate from free space in arena's + mi_memid_t memid; + mi_page_t* page = NULL; + if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) { + page = (mi_page_t*)mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, &memid, tld); + } + + // otherwise fall back to the OS + if (page == NULL) { + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_blocks(block_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + } + + if (page == NULL) return NULL; + + // claimed free blocks: initialize the page partly + _mi_memzero_aligned(page, sizeof(*page)); + mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN)); + const size_t reserved = (mi_size_of_blocks(block_count) - MI_PAGE_INFO_SIZE) / block_size; + mi_assert_internal(reserved > 0 && reserved < UINT16_MAX); + 
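// Editor's aside (worked example, not in the patch): with the 64 KiB small page
// and MI_PAGE_INFO_SIZE = 2*64 = 128 bytes implied by the definitions earlier in
// this patch, a fresh small page for 64-byte blocks reserves
//   (65536 - 128) / 64 = 1022
// blocks, comfortably below the UINT16_MAX bound that the assertion above checks.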
page->reserved = reserved; + page->page_start = (uint8_t*)page + MI_PAGE_INFO_SIZE; + page->block_size = block_size; + page->memid = memid; + page->free_is_zero = memid.initially_zero; + if (block_size > 0 && _mi_is_power_of_two(block_size)) { + page->block_size_shift = (uint8_t)mi_ctz(block_size); + } + else { + page->block_size_shift = 0; + } + + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(mi_page_is_abandoned(page)); + return page; +} + +// block_count: arena block count for the page +// block size : page block size +static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size_t block_size) { + const size_t req_arena_id = heap->arena_id; + mi_tld_t* const tld = heap->tld; + + // 1. look for an abandoned page + mi_page_t* page = mi_arena_page_try_find_abandoned(block_count, block_size, req_arena_id, tld); + if (page != NULL) { + _mi_page_reclaim(heap,page); + return page; + } + + // 2. find a free block, potentially allocating a new arena + page = mi_arena_page_alloc_fresh(block_count, block_size, req_arena_id, tld); + if (page != NULL) { + _mi_page_init(heap, page); + return page; + } + + return NULL; +} + + +static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { + _mi_error_message(EINVAL, "singleton page is not yet implemented\n"); + return NULL; +} + + +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { + mi_page_t* page; + if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { + mi_assert_internal(_mi_is_power_of_two(page_alignment)); + page = mi_singleton_page_alloc(heap, block_size, page_alignment); + } + else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { + page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + } + else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { + page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + } + else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { + page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + } + else { + page = mi_singleton_page_alloc(heap, block_size, page_alignment); + } + // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); + return page; +} + + /* ----------------------------------------------------------- Arena free ----------------------------------------------------------- */ diff --git a/src/bitmap.c b/src/bitmap.c index 463d74c7..9faa9ae9 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -512,9 +512,9 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) } -#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ +#define mi_bitmap_forall_set_chunks(bitmap,tseq,decl_chunk_idx) \ { size_t _set_idx; \ - size_t _start = start % MI_BFIELD_BITS; \ + size_t _start = tseq % MI_BFIELD_BITS; \ mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; @@ -530,8 +530,8 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). 
-bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) +bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { @@ -554,8 +554,8 @@ bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t star // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) +bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { @@ -576,11 +576,11 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pi // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { +bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries if (n == 0 || n > MI_BFIELD_BITS) return false; - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { diff --git a/src/bitmap.h b/src/bitmap.h index 198a2902..fcadc213 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -79,14 +79,14 @@ mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, si // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ); // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); #endif // MI_XBITMAP_H diff --git a/src/page-map.c b/src/page-map.c index d3fcef79..cb527886 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -32,9 +32,13 @@ static bool mi_page_map_init(void) { return false; } if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { - _mi_warning_message("the page map was committed on-demand but not zero initialized!\n"); + _mi_warning_message("the page map was committed but not zero initialized!\n"); _mi_memzero_aligned(_mi_page_map, page_map_size); } + // commit the first part so NULL pointers get resolved without an access violation + if (!mi_page_map_all_committed) { + _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL); + } return true; } @@ -72,7 +76,7 @@ void _mi_page_map_register(mi_page_t* page) { // set the offsets for (int i = 0; i < block_count; i++) { mi_assert_internal(i < 128); - _mi_page_map[idx + i] = (int8_t)(-i-1); + _mi_page_map[idx + i] = (signed char)(-i-1); } } diff --git a/src/page.c b/src/page.c index a00ff615..fa006085 100644 --- a/src/page.c +++ b/src/page.c @@ -119,7 +119,7 @@ bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(page->keys[0] != 0); #endif if (mi_page_heap(page)!=NULL) { - mi_assert_internal(!_mi_process_is_initialized || page->thread_id == mi_page_heap(page)->thread_id || page->thread_id==0); + mi_assert_internal(!_mi_process_is_initialized || mi_page_thread_id(page) == mi_page_heap(page)->thread_id || mi_page_thread_id(page)==0); { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); @@ -249,19 +249,22 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { // called from segments when reclaiming abandoned pages void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { + mi_page_set_heap(page, heap); + _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) + _mi_page_free_collect(page, false); // ensure used count is up to date + mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); - #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); - #endif - + // TODO: push on full queue immediately if it is full? 
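The `_mi_page_map_register` loop above tags each block of a multi-block page with a small negative offset. One consistent way to use that encoding is that, for any block index `b`, `b + map[b] + 1` yields the index of the page's first block; the toy below demonstrates that round trip. It is only an illustration of the encoding shown above, not a claim about the exact lookup code elsewhere in the patch:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #define MAP_SIZE 4096
    static int8_t page_map[MAP_SIZE];   // toy stand-in for _mi_page_map

    // Encode: same scheme as the loop above, offsets -1, -2, ... for the page's blocks.
    static void map_register(size_t first_block, size_t block_count) {
      for (size_t i = 0; i < block_count; i++) {
        assert(i < 128);                           // offset must fit in a signed byte
        page_map[first_block + i] = (int8_t)(-(int)i - 1);
      }
    }

    // Decode (toy): recover the first block index from any block of the page.
    static size_t map_page_start(size_t block) {
      return (size_t)((ptrdiff_t)block + page_map[block] + 1);
    }

    // Example: a page spanning blocks 10..13 maps every block back to 10.
    //   map_register(10, 4);  assert(map_page_start(12) == 10);
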
mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_push(heap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); } + + // allocate a fresh page from a segment static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) { #if !MI_HUGE_PAGE_ABANDON @@ -269,16 +272,12 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_assert_internal(mi_heap_contains_queue(heap, pq)); mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); #endif - mi_page_t* page = _mi_heap_page_alloc(heap, block_size, page_alignment); + mi_page_t* page = _mi_arena_page_alloc(heap, block_size, page_alignment); if (page == NULL) { // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) return NULL; } mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); - // a fresh page was found, initialize it - const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc - mi_assert_internal(full_block_size >= block_size); - mi_page_init(heap, page, full_block_size, heap->tld); mi_heap_stat_increase(heap, pages, 1); if (pq != NULL) { mi_page_queue_push(heap, pq, page); } mi_assert_expensive(_mi_page_is_valid(page)); @@ -389,7 +388,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { // and abandon it mi_assert_internal(mi_page_is_abandoned(page)); - _mi_arena_page_abandon(page,&pheap->tld); + _mi_arena_page_abandon(page, pheap->tld); } // force abandon a page @@ -432,7 +431,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { // and free it mi_page_set_heap(page,NULL); - _mi_arena_page_free(page, force, &pheap->tld); + _mi_arena_page_free(page, pheap->tld); } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE @@ -617,7 +616,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co // Note: we also experimented with "bump" allocation on the first // allocations but this did not speed up any benchmark (due to an // extra test in malloc? or cache effects?) 
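`mi_page_extend_free` (next) grows a page's free list in increments rather than threading every block up front, which keeps the first allocations cheap. A toy version of the linking step that `mi_page_free_list_extend` performs, assuming `extend >= 1`, `bsize >= sizeof(block_t)`, and ignoring the secure/encoded free-list variant (field and function names are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    typedef struct block_s { struct block_s* next; } block_t;

    // Link `extend` fresh blocks of `bsize` bytes each, starting right after the
    // currently used capacity, and push them in front of the existing free list.
    static block_t* free_list_extend(uint8_t* page_area, size_t capacity, size_t bsize,
                                     size_t extend, block_t* free_head) {
      block_t* const start = (block_t*)(page_area + capacity * bsize);
      block_t* last = start;
      for (size_t i = 1; i < extend; i++) {
        block_t* next = (block_t*)((uint8_t*)last + bsize);
        last->next = next;                 // thread the blocks into a singly linked list
        last = next;
      }
      last->next = free_head;              // the old free list follows the new blocks
      return start;                        // new head; the caller also bumps page->capacity
    }
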
-static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) { +static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_expensive(mi_page_is_valid_init(page)); #if (MI_SECURE<=2) mi_assert(page->free == NULL); @@ -629,7 +628,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) size_t page_size; //uint8_t* page_start = mi_page_area(page, &page_size); - mi_stat_counter_increase(tld->stats.pages_extended, 1); + mi_heap_stat_counter_increase(heap, pages_extended, 1); // calculate the extend count const size_t bsize = mi_page_block_size(page); @@ -651,48 +650,37 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) // and append the extend the free list if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) { - mi_page_free_list_extend(page, bsize, extend, &tld->stats ); + mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats ); } else { - mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats); + mi_page_free_list_extend_secure(heap, page, bsize, extend, &heap->tld->stats); } // enable the new free list page->capacity += (uint16_t)extend; - mi_stat_increase(tld->stats.page_committed, extend * bsize); + mi_heap_stat_increase(heap, page_committed, extend * bsize); mi_assert_expensive(mi_page_is_valid_init(page)); } -// Initialize a fresh page -static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) { +// Initialize a fresh page (that is already partially initialized) +void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); - mi_assert_internal(block_size > 0); - // set fields mi_page_set_heap(page, heap); - page->block_size = block_size; size_t page_size; uint8_t* page_start = mi_page_area(page, &page_size); mi_track_mem_noaccess(page_start,page_size); - mi_assert_internal(page_size / block_size < (1L<<16)); - page->reserved = (uint16_t)(page_size / block_size); + mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16)); mi_assert_internal(page->reserved > 0); #if (MI_PADDING || MI_ENCODE_FREELIST) page->keys[0] = _mi_heap_random_next(heap); page->keys[1] = _mi_heap_random_next(heap); #endif - page->free_is_zero = page->memid.initially_zero; #if MI_DEBUG>2 if (page->memid.initially_zero) { mi_track_mem_defined(page->page_start, page_size); mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); } #endif - if (block_size > 0 && _mi_is_power_of_two(block_size)) { - page->block_size_shift = (uint8_t)mi_ctz(block_size); - } - else { - page->block_size_shift = 0; - } - + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); @@ -705,11 +693,11 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif - mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift))); + mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list - mi_page_extend_free(heap,page,tld); + mi_page_extend_free(heap,page); mi_assert(mi_page_immediate_available(page)); } From 68f5fb2f4b857681f80789ac0902bb39535bd072 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 15:08:06 -0800 Subject: [PATCH 004/264] wip: further 
progress on segment removal; arena allocation --- ide/vs2022/mimalloc-override.vcxproj | 1 - ide/vs2022/mimalloc-override.vcxproj.filters | 3 -- ide/vs2022/mimalloc.vcxproj | 7 ++- ide/vs2022/mimalloc.vcxproj.filters | 6 +-- include/mimalloc/bits.h | 2 +- include/mimalloc/internal.h | 6 ++- src/alloc-aligned.c | 4 +- src/arena.c | 16 +++--- src/bitmap.c | 6 +-- src/free.c | 2 +- src/heap.c | 23 ++++---- src/init.c | 57 +++++++------------- src/os.c | 9 ++-- src/page-map.c | 2 +- src/page-queue.c | 10 ++-- src/page.c | 9 ++-- src/prim/windows/prim.c | 6 +-- src/stats.c | 9 ++-- 18 files changed, 80 insertions(+), 98 deletions(-) diff --git a/ide/vs2022/mimalloc-override.vcxproj b/ide/vs2022/mimalloc-override.vcxproj index 4383d886..32bd97d1 100644 --- a/ide/vs2022/mimalloc-override.vcxproj +++ b/ide/vs2022/mimalloc-override.vcxproj @@ -265,7 +265,6 @@ - diff --git a/ide/vs2022/mimalloc-override.vcxproj.filters b/ide/vs2022/mimalloc-override.vcxproj.filters index a9f66c35..6656c16d 100644 --- a/ide/vs2022/mimalloc-override.vcxproj.filters +++ b/ide/vs2022/mimalloc-override.vcxproj.filters @@ -46,9 +46,6 @@ Sources - - Sources - Sources diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 3dd7326f..41fe0b46 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -214,6 +214,12 @@ + + true + true + true + true + false @@ -258,7 +264,6 @@ - diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 2eed7e90..237ef1ed 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,6 +58,9 @@ Sources + + Sources + @@ -87,9 +90,6 @@ Headers - - Headers - Headers diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index ad7ea3e6..d6695a00 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -282,7 +282,7 @@ static inline size_t mi_rotr(size_t x, size_t r) { #elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32) return mi_builtin(rotateright32)(x,r); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - #if MI_BFIELD_SIZE==4 + #if MI_SIZE_BITS==32 return _lrotr(x,(int)r); #else return _rotr64(x,(int)r); diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index d60b0c15..515acfc1 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -140,6 +140,8 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld); void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld); +bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page); +void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap); void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); @@ -567,11 +569,11 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { return (page->reserved - page->used <= frac); } -static inline bool mi_page_is_abandoned(mi_page_t* page) { +static inline bool mi_page_is_abandoned(const mi_page_t* page) { return (mi_page_thread_id(page) == 0); } -static inline bool mi_page_is_huge(mi_page_t* page) { +static inline bool mi_page_is_huge(const mi_page_t* page) { return (page->block_size > MI_LARGE_MAX_OBJ_SIZE); } diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index b4da4ded..43dc2d36 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -16,12 +16,12 @@ terms of 
the MIT license. A copy of the license can be found in the file // ------------------------------------------------------ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { - // objects up to `MI_MAX_ALIGN_GUARANTEE` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`). + // objects up to `MI_PAGE_ALIGN` are allocated aligned to their size mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); if (alignment > size) return false; if (alignment <= MI_MAX_ALIGN_SIZE) return true; const size_t bsize = mi_good_size(size); - return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0); + return (bsize <= MI_PAGE_ALIGN && (bsize & (alignment-1)) == 0); } #if MI_GUARDED diff --git a/src/arena.c b/src/arena.c index c9f8400b..0db8acf3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -298,7 +298,7 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are for (size_t i = 0; i < _max_arena; i++) { \ size_t _idx = i + _start; \ if (_idx >= _max_arena) { _idx -= _max_arena; } \ - const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); \ + const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); MI_UNUSED(var_arena_id);\ mi_arena_t* const var_arena = mi_arena_from_index(_idx); \ if (mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ { @@ -341,6 +341,7 @@ static mi_decl_noinline void* mi_arena_try_alloc( mi_assert(block_count <= MI_ARENA_MAX_OBJ_BLOCKS); mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + // try to find free blocks in the arena's void* p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; @@ -354,6 +355,8 @@ static mi_decl_noinline void* mi_arena_try_alloc( if (p != NULL) return p; } } + + return NULL; } // Allocate from the OS (if allowed) @@ -445,7 +448,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t bl } } mi_forall_arenas_end(); - return false; + return NULL; } static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) @@ -455,7 +458,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz const size_t alignment = MI_ARENA_BLOCK_ALIGN; // try to allocate from free space in arena's - mi_memid_t memid; + mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) { page = (mi_page_t*)mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, &memid, tld); @@ -472,8 +475,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz _mi_memzero_aligned(page, sizeof(*page)); mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN)); const size_t reserved = (mi_size_of_blocks(block_count) - MI_PAGE_INFO_SIZE) / block_size; - mi_assert_internal(reserved > 0 && reserved < UINT16_MAX); - page->reserved = reserved; + mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); + page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + MI_PAGE_INFO_SIZE; page->block_size = block_size; page->memid = memid; @@ -493,7 +496,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz // block_count: arena block count for the page // block size : page block size static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size_t block_size) { - 
const size_t req_arena_id = heap->arena_id; + const mi_arena_id_t req_arena_id = heap->arena_id; mi_tld_t* const tld = heap->tld; // 1. look for an abandoned page @@ -515,6 +518,7 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { + MI_UNUSED(heap); MI_UNUSED(block_size); MI_UNUSED(page_alignment); _mi_error_message(EINVAL, "singleton page is not yet implemented\n"); return NULL; } diff --git a/src/bitmap.c b/src/bitmap.c index 9faa9ae9..24c0d9c9 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -149,7 +149,7 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); all_already_xset = all_already_xset && already_xset; // next field @@ -268,7 +268,6 @@ static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, // try again } #else - size_t idx; for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { size_t idx; if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit @@ -306,7 +305,6 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, // try again } #else - size_t idx; for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { const mi_bfield_t x = chunk->bfields[i]; // has_set8 has low bit in each byte set if the byte in x == 0xFF @@ -374,7 +372,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, // are all bits in a bitmap chunk set? -static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { +static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return _mm256_test_all_ones(vec); diff --git a/src/free.c b/src/free.c index e1cc9276..224070fe 100644 --- a/src/free.c +++ b/src/free.c @@ -233,7 +233,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) { // the page is abandoned, try to reclaim it into our heap - if (_mi_heap_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue + if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); mi_free(block); // recursively free as now it will be a local free in our heap diff --git a/src/heap.c b/src/heap.c index e4955ba7..8ee66055 100644 --- a/src/heap.c +++ b/src/heap.c @@ -54,9 +54,7 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ MI_UNUSED(arg1); MI_UNUSED(arg2); MI_UNUSED(pq); - mi_assert_internal(mi_page_heap(page) == heap); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_internal(segment->thread_id == heap->thread_id); + mi_assert_internal(mi_page_heap(page) == heap); mi_assert_expensive(_mi_page_is_valid(page)); return true; } @@ -135,7 +133,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // the main thread is abandoned 
(end-of-program), try to reclaim all abandoned segments. // if all memory is freed by now, all segments should be freed. // note: this only collects in the current subprocess - _mi_abandoned_reclaim_all(heap, &heap->tld->segments); + _mi_arena_reclaim_all_abandoned(heap); } // if abandoning, mark all pages to no longer add to delayed_free @@ -155,7 +153,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); // collect segments (purge pages, this can be expensive so don't force on abandonment) - _mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); + // _mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); // if forced, collect thread data cache on program-exit (or shared library unload) if (force && is_main_thread && mi_heap_is_backing(heap)) { @@ -320,13 +318,13 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // stats const size_t bsize = mi_page_block_size(page); - if (bsize > MI_LARGE_OBJ_SIZE_MAX) { + if (bsize > MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, huge, bsize); } #if (MI_STAT) _mi_page_free_collect(page, false); // update used count const size_t inuse = page->used; - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, normal, bsize * inuse); #if (MI_STAT>1) mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse); @@ -343,7 +341,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // mi_page_free(page,false); page->next = NULL; page->prev = NULL; - _mi_segment_page_free(page,false /* no force? */, &heap->tld->segments); + _mi_arena_page_free(page,heap->tld); return true; // keep going } @@ -483,11 +481,8 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { // static since it is not thread safe to access heaps from other threads. static mi_heap_t* mi_heap_of_block(const void* p) { if (p == NULL) return NULL; - mi_segment_t* segment = _mi_ptr_segment(p); - bool valid = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(valid); - if mi_unlikely(!valid) return NULL; - return mi_page_heap(_mi_segment_page_of(segment,p)); + mi_page_t* page = _mi_ptr_page(p); // TODO: check pointer validity? + return mi_page_heap(page); } bool mi_heap_contains_block(mi_heap_t* heap, const void* p) { @@ -562,7 +557,7 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_ if (page->used == 0) return true; size_t psize; - uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + uint8_t* const pstart = mi_page_area(page, &psize); mi_heap_t* const heap = mi_page_heap(page); const size_t bsize = mi_page_block_size(page); const size_t ubsize = mi_page_usable_block_size(page); // without padding diff --git a/src/init.c b/src/init.c index 2544f097..215d6be8 100644 --- a/src/init.c +++ b/src/init.c @@ -14,8 +14,6 @@ terms of the MIT license. 
A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, - false, false, false, false, 0, // capacity 0, // reserved capacity { 0 }, // flags @@ -33,10 +31,9 @@ const mi_page_t _mi_page_empty = { #endif MI_ATOMIC_VAR_INIT(0), // xthread_free MI_ATOMIC_VAR_INIT(0), // xheap - NULL, NULL - #if MI_INTPTR_SIZE==4 - , { NULL } - #endif + MI_ATOMIC_VAR_INIT(0), // xthread_id + NULL, NULL, // next, prev + { { NULL, 0}, false, false, false, MI_MEM_NONE } // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -63,8 +60,8 @@ const mi_page_t _mi_page_empty = { QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \ - QNULL(MI_LARGE_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \ - QNULL(MI_LARGE_OBJ_WSIZE_MAX + 2) /* Full queue */ } + QNULL(MI_LARGE_MAX_OBJ_WSIZE + 1 /* 655360, Huge queue */), \ + QNULL(MI_LARGE_MAX_OBJ_WSIZE + 2) /* Full queue */ } #define MI_STAT_COUNT_NULL() {0,0,0,0} @@ -82,8 +79,6 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ @@ -101,10 +96,10 @@ const mi_page_t _mi_page_empty = { mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, - MI_ATOMIC_VAR_INIT(NULL), - 0, // tid + MI_ATOMIC_VAR_INIT(NULL), // thread delayed free + 0, // thread_id + 0, // arena_id 0, // cookie - 0, // arena id { 0, 0 }, // keys { {0}, {0}, 0, true }, // random 0, // page count @@ -124,17 +119,6 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } -// Thread sequence number -static _Atomic(size_t) mi_tcount; -static mi_decl_thread size_t mi_tseq; - -size_t _mi_thread_seq_id(void) mi_attr_noexcept { - size_t tseq = mi_tseq; - if (tseq == 0) { - mi_tseq = tseq = mi_atomic_add_acq_rel(&mi_tcount,1); - } - return tseq; -} // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; @@ -146,12 +130,10 @@ static mi_decl_cache_align mi_subproc_t mi_subproc_default; static mi_decl_cache_align mi_tld_t tld_main = { 0, false, &_mi_heap_main, &_mi_heap_main, - { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, - 0, 0, 0, 0, 0, &mi_subproc_default, - &tld_main.stats, &tld_main.os - }, // segments + NULL, // subproc + 0, // tseq { 0, &tld_main.stats }, // os - { MI_STATS_NULL } // stats + { MI_STATS_NULL } // stats }; mi_decl_cache_align mi_heap_t _mi_heap_main = { @@ -287,9 +269,9 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { mi_heap_t* heap = mi_heap_get_default(); if (heap == NULL) return; - mi_assert(heap->tld->segments.subproc == &mi_subproc_default); - if (heap->tld->segments.subproc != &mi_subproc_default) return; - heap->tld->segments.subproc = _mi_subproc_from_id(subproc_id); + mi_assert(heap->tld->subproc == &mi_subproc_default); + if (heap->tld->subproc != &mi_subproc_default) 
return; + heap->tld->subproc = _mi_subproc_from_id(subproc_id); } @@ -405,14 +387,16 @@ static bool _mi_thread_heap_init(void) { return false; } +// Thread sequence number +static _Atomic(size_t) mi_tcount; + // initialize thread local data void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { _mi_memzero_aligned(tld,sizeof(mi_tld_t)); tld->heap_backing = bheap; tld->heaps = NULL; - tld->segments.subproc = &mi_subproc_default; - tld->segments.stats = &tld->stats; - tld->segments.os = &tld->os; + tld->subproc = &mi_subproc_default; + tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } @@ -449,8 +433,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { _mi_stats_done(&heap->tld->stats); // free if not the main thread - if (heap != &_mi_heap_main) { - mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id()); + if (heap != &_mi_heap_main) { mi_thread_data_free((mi_thread_data_t*)heap); } else { diff --git a/src/os.c b/src/os.c index 83521766..da41d152 100644 --- a/src/os.c +++ b/src/os.c @@ -245,7 +245,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // note: this is dangerous on Windows as VirtualFree needs the actual base pointer // this is handled though by having the `base` field in the memid's *base = p; // remember the base - p = mi_align_up_ptr(p, alignment); + p = _mi_align_up_ptr(p, alignment); // explicitly commit only the aligned part if (commit) { @@ -258,7 +258,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (p == NULL) return NULL; // and selectively unmap parts around the over-allocated area. - void* aligned_p = mi_align_up_ptr(p, alignment); + void* aligned_p = _mi_align_up_ptr(p, alignment); size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; size_t mid_size = _mi_align_up(size, _mi_os_page_size()); size_t post_size = over_size - pre_size - mid_size; @@ -316,6 +316,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo } void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { + MI_UNUSED(stats); void* p = _mi_os_alloc(size, memid, &_mi_stats_main); if (p == NULL) return NULL; @@ -373,10 +374,10 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, if (size == 0 || addr == NULL) return NULL; // page align conservatively within the range - void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size()) + void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size()) : mi_align_down_ptr(addr, _mi_os_page_size())); void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size()) - : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size())); + : _mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size())); ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start; if (diff <= 0) return NULL; diff --git a/src/page-map.c b/src/page-map.c index cb527886..d70c3ee6 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -74,7 +74,7 @@ void _mi_page_map_register(mi_page_t* page) { } // set the offsets - for (int i = 0; i < block_count; i++) { + for (int i = 0; i < (int)block_count; i++) { mi_assert_internal(i < 128); _mi_page_map[idx + i] = (signed char)(-i-1); } diff --git a/src/page-queue.c b/src/page-queue.c index 0a791adb..c6b19985 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -38,15 +38,15 @@ terms of the MIT license. 
A copy of the license can be found in the file static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) { - return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+sizeof(uintptr_t))); + return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+sizeof(uintptr_t))); } static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) { - return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+(2*sizeof(uintptr_t)))); + return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+(2*sizeof(uintptr_t)))); } static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { - return (pq->block_size > MI_LARGE_OBJ_SIZE_MAX); + return (pq->block_size > MI_LARGE_MAX_OBJ_SIZE); } /* ----------------------------------------------------------- @@ -76,7 +76,7 @@ static inline uint8_t mi_bin(size_t size) { bin = (uint8_t)wsize; } #endif - else if (wsize > MI_LARGE_OBJ_WSIZE_MAX) { + else if (wsize > MI_LARGE_MAX_OBJ_WSIZE) { bin = MI_BIN_HUGE; } else { @@ -113,7 +113,7 @@ size_t _mi_bin_size(uint8_t bin) { // Good size for allocation size_t mi_good_size(size_t size) mi_attr_noexcept { - if (size <= MI_LARGE_OBJ_SIZE_MAX) { + if (size <= MI_LARGE_MAX_OBJ_SIZE) { return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } else { diff --git a/src/page.c b/src/page.c index fa006085..122b4324 100644 --- a/src/page.c +++ b/src/page.c @@ -36,8 +36,8 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta return (mi_block_t*)((uint8_t*)page_start + (i * block_size)); } -static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld); -static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld); +//static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld); +static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page); #if (MI_DEBUG>=3) static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { @@ -83,7 +83,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->capacity <= page->reserved); // const size_t bsize = mi_page_block_size(page); - uint8_t* start = mi_page_start(page); + // uint8_t* start = mi_page_start(page); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -414,6 +414,7 @@ void _mi_page_force_abandon(mi_page_t* page) { // Free a page with no more free blocks void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { + MI_UNUSED(force); mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(pq == mi_page_queue_of(page)); @@ -784,7 +785,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p } if (page != NULL && !mi_page_immediate_available(page)) { mi_assert_internal(mi_page_is_expandable(page)); - mi_page_extend_free(heap, page, heap->tld); + mi_page_extend_free(heap, page); } if (page == NULL) { diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 1d3d6f41..418c950f 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -127,7 +127,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) ULONGLONG memInKiB = 0; if (GetPhysicallyInstalledSystemMemory(&memInKiB)) { if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) { - config->physical_memory = memInKiB * MI_KiB; + config->physical_memory = (size_t)(memInKiB * MI_KiB); } } // get the VirtualAlloc2 function @@ -175,7 +175,7 @@ int _mi_prim_free(void* addr, size_t size ) { // the start of the region. 
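The renamed `MI_LARGE_MAX_OBJ_SIZE`/`_WSIZE` constants in the page-queue changes above keep the existing trick that the huge and full queues are identified purely by sentinel block sizes just above the large-object maximum. The predicates reduce to the following sketch, with a hypothetical stand-in constant (the real value comes from the mimalloc headers):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    #define LARGE_MAX_OBJ_SIZE (512*1024)   // hypothetical stand-in for MI_LARGE_MAX_OBJ_SIZE

    typedef struct queue_s { size_t block_size; } queue_t;

    static bool queue_is_huge(const queue_t* pq)    { return pq->block_size == LARGE_MAX_OBJ_SIZE + sizeof(uintptr_t); }
    static bool queue_is_full(const queue_t* pq)    { return pq->block_size == LARGE_MAX_OBJ_SIZE + 2*sizeof(uintptr_t); }
    static bool queue_is_special(const queue_t* pq) { return pq->block_size > LARGE_MAX_OBJ_SIZE; }
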
MEMORY_BASIC_INFORMATION info = { 0 }; VirtualQuery(addr, &info, sizeof(info)); - if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) { + if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)(4*MI_MiB)) { errcode = 0; err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0); if (err) { errcode = GetLastError(); } @@ -239,7 +239,7 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen // success, return the address return p; } - else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) && + else if (max_retry_msecs > 0 && (try_alignment <= 8*MI_MiB) && (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 && win_is_out_of_memory_error(GetLastError())) { // if committing regular memory and being out-of-memory, diff --git a/src/stats.c b/src/stats.c index 29376ace..14489937 100644 --- a/src/stats.c +++ b/src/stats.c @@ -90,7 +90,6 @@ static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t // must be thread safe as it is called from stats_merge static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { if (stats==src) return; - mi_stat_add(&stats->segments, &src->segments,1); mi_stat_add(&stats->pages, &src->pages,1); mi_stat_add(&stats->reserved, &src->reserved, 1); mi_stat_add(&stats->committed, &src->committed, 1); @@ -99,11 +98,9 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_add(&stats->page_committed, &src->page_committed, 1); mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1); - mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1); mi_stat_add(&stats->threads, &src->threads, 1); mi_stat_add(&stats->malloc, &src->malloc, 1); - mi_stat_add(&stats->segments_cache, &src->segments_cache, 1); mi_stat_add(&stats->normal, &src->normal, 1); mi_stat_add(&stats->huge, &src->huge, 1); mi_stat_add(&stats->giant, &src->giant, 1); @@ -329,9 +326,9 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_peak_print(&stats->reset, "reset", 1, out, arg ); mi_stat_peak_print(&stats->purged, "purged", 1, out, arg ); mi_stat_print(&stats->page_committed, "touched", 1, out, arg); - mi_stat_print(&stats->segments, "segments", -1, out, arg); - mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); - mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); + //mi_stat_print(&stats->segments, "segments", -1, out, arg); + //mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); + //mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); mi_stat_print(&stats->pages, "pages", -1, out, arg); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); From 9603fe8b50121dd0ee8b4f9748faba9e749569bf Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 16:27:58 -0800 Subject: [PATCH 005/264] can compile without missing functions --- ide/vs2022/mimalloc-override.vcxproj | 8 +- ide/vs2022/mimalloc-override.vcxproj.filters | 8 +- ide/vs2022/mimalloc.vcxproj | 6 - ide/vs2022/mimalloc.vcxproj.filters | 3 - include/mimalloc/internal.h | 6 +- src/arena.c | 971 ++----------------- src/page-map.c | 39 +- 7 files changed, 145 insertions(+), 896 deletions(-) diff --git a/ide/vs2022/mimalloc-override.vcxproj b/ide/vs2022/mimalloc-override.vcxproj index 32bd97d1..a5d5c34c 100644 --- a/ide/vs2022/mimalloc-override.vcxproj +++ 
b/ide/vs2022/mimalloc-override.vcxproj @@ -236,17 +236,18 @@ - + + + true true true true - - + true @@ -264,7 +265,6 @@ - diff --git a/ide/vs2022/mimalloc-override.vcxproj.filters b/ide/vs2022/mimalloc-override.vcxproj.filters index 6656c16d..60c7a1fb 100644 --- a/ide/vs2022/mimalloc-override.vcxproj.filters +++ b/ide/vs2022/mimalloc-override.vcxproj.filters @@ -46,16 +46,16 @@ Sources - - Sources - Sources Sources - + + Sources + + Sources diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 41fe0b46..8606faf3 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -214,12 +214,6 @@ - - true - true - true - true - false diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 237ef1ed..a47efddd 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,9 +58,6 @@ Sources - - Sources - diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 515acfc1..3c8216ec 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -130,8 +130,8 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t ma // arena.c mi_arena_id_t _mi_arena_id_none(void); void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); @@ -503,7 +503,7 @@ static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); mi_atomic_store_release(&page->xheap,(uintptr_t)heap); if (heap != NULL) { - page->heap_tag = heap->tag; + page->heap_tag = heap->tag; mi_atomic_store_release(&page->xthread_id, heap->thread_id); } else { diff --git a/src/arena.c b/src/arena.c index 0db8acf3..9dbf73d6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -155,15 +155,26 @@ static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, siz } // returns if the arena is exclusive -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { +static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); *arena_index = mi_arena_id_index(memid.mem.arena.id); *block_index = memid.mem.arena.block_index; return memid.mem.arena.is_exclusive; } +// get the arena and block index +static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* block_index) { + size_t arena_index; + mi_arena_memid_indices(memid, &arena_index, block_index); + return mi_arena_from_index(arena_index); +} +static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index) { + // todo: maybe store the arena* directly in the page? 
+ return mi_arena_from_memid(page->memid, block_index); +} + /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ @@ -407,7 +418,7 @@ void* _mi_arena_alloc_aligned( return mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); } -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } @@ -546,6 +557,95 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ return page; } +static uint8_t* mi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { + // todo: record real allocated size instead of trying to recalculate? + size_t page_size; + uint8_t* const pstart = mi_page_area(page, &page_size); + const size_t diff = pstart - (uint8_t*)page; + const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); + if (psize != NULL) { *psize = size; } + return pstart; +} + +void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { + size_t size; + uint8_t* pstart = mi_arena_page_allocated_area(page, &size); + _mi_arena_free(pstart, size, size, page->memid, &tld->stats); +} + +/* ----------------------------------------------------------- + Arena abandon +----------------------------------------------------------- */ + +void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(page->next==NULL); + + if (mi_page_all_free(page)) { + _mi_arena_page_free(page, tld); + } + else if (mi_page_is_full(page)) { // includes singleton pages + // leave as is; it will be reclaimed on free + } + else if (mi_memkind_is_os(page->memid.memkind)) { + _mi_error_message(EINVAL, "implement page abandon for OS allocated pages\n"); + // leave as is; it will be reclaimed on the first free + } + else if (page->memid.memkind==MI_MEM_ARENA) { + size_t size; + mi_arena_page_allocated_area(page, &size); + size_t bin = _mi_bin(mi_page_block_size(page)); + size_t block_index; + mi_arena_t* arena = mi_page_arena(page, &block_index); + bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_abandoned[bin], block_index, 1, NULL); + MI_UNUSED(were_zero); mi_assert_internal(were_zero); + mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); + } + else { + _mi_error_message(EINVAL, "implement page abandon for external allocated pages\n"); + // leave as is; it will be reclaimed on the first free + } +} + +bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { + mi_assert_internal(mi_page_is_abandoned(page)); + // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned + mi_memid_t memid = page->memid; + if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's + + if mi_likely(memid.memkind == MI_MEM_ARENA) { + size_t block_index; + mi_arena_t* arena = mi_page_arena(page, &block_index); + if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess + + // don't reclaim more from a `free` call than half the current segments + // this is to prevent a pure free-ing thread to start owning too many segments + // (but not for 
out-of-arena segments as that is the main way to be reclaimed for those) + // if (segment->memid.memkind == MI_MEM_ARENA && heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) { + // return false; + // } + const size_t bin = _mi_bin(page->block_size); + if (mi_bitmap_try_xsetN(MI_BIT_CLEAR, &arena->blocks_abandoned[bin], block_index, 1)) { + // we got it atomically + _mi_page_reclaim(heap, page); + mi_assert_internal(!mi_page_is_abandoned(page)); + return true; + } + } + else { + _mi_warning_message("implement reclaim for OS allocated pages\n"); + } + + + return false; +} + +void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { + MI_UNUSED(heap); + // TODO: implement this + return; +} + /* ----------------------------------------------------------- Arena free @@ -1017,97 +1117,15 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { } -#if 0 - -#define MI_IN_ARENA_C -#include "arena-abandon.c" -#undef MI_IN_ARENA_C - -/* ----------------------------------------------------------- - Arena id's - id = arena_index + 1 ------------------------------------------------------------ */ - -size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); -} - -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; -} - -mi_arena_id_t _mi_arena_id_none(void) { - return 0; -} - -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); -} - -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { - if (memid.memkind == MI_MEM_ARENA) { - return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); - } - else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); - } -} - -size_t mi_arena_get_count(void) { - return mi_atomic_load_relaxed(&mi_arena_count); -} - -mi_arena_t* mi_arena_from_index(size_t idx) { - mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); -} - - -/* ----------------------------------------------------------- - Arena allocations get a (currently) 16-bit memory id where the - lower 8 bits are the arena id, and the upper bits the block index. 
------------------------------------------------------------ */ - -static size_t mi_block_count_of_size(size_t size) { - return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); -} - -static size_t mi_size_of_blocks(size_t bcount) { - return (bcount * MI_ARENA_BLOCK_SIZE); -} - -static size_t mi_arena_size(mi_arena_t* arena) { - return mi_size_of_blocks(arena->block_count); -} - -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { - mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); - memid.mem.arena.id = id; - memid.mem.arena.block_index = bitmap_index; - memid.mem.arena.is_exclusive = is_exclusive; - return memid; -} - -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { - mi_assert_internal(memid.memkind == MI_MEM_ARENA); - *arena_index = mi_arena_id_index(memid.mem.arena.id); - *bitmap_index = memid.mem.arena.block_index; - return memid.mem.arena.is_exclusive; -} - - - /* ----------------------------------------------------------- Special static area for mimalloc internal structures - to avoid OS calls (for example, for the arena metadata (~= 256b)) + to avoid OS calls (for example, for the subproc metadata (~= 721b)) ----------------------------------------------------------- */ #define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 -static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; +static mi_decl_cache_align _Atomic(size_t)mi_arena_static_top; static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { *memid = _mi_memid_none(); @@ -1164,784 +1182,9 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { } } -void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { - return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex))); -} - - -/* ----------------------------------------------------------- - Thread safe allocation in an arena ------------------------------------------------------------ */ - -// claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats) -{ - size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter - if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { - mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around - return true; - }; +bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); + _mi_error_message(EINVAL, "implement mi_abandon_visit_blocks\n"); return false; } - -/* ----------------------------------------------------------- - Arena Allocation ------------------------------------------------------------ */ - -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, mi_memid_t* memid, mi_os_tld_t* tld) -{ - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - - mi_bitmap_index_t bitmap_index; - if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, 
tld->stats)) return NULL; - - // claimed it! - void* p = mi_arena_block_start(arena, bitmap_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); - memid->is_pinned = arena->memid.is_pinned; - - // none of the claimed blocks should be scheduled for a decommit - if (arena->blocks_purge != NULL) { - // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`). - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index); - } - - // set the dirty bits (todo: no need for an atomic op here?) - if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { - memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); - } - - // set commit state - if (arena->blocks_committed == NULL) { - // always committed - memid->initially_committed = true; - } - else if (commit) { - // commit requested, but the range may not be committed as a whole: ensure it is committed now - memid->initially_committed = true; - bool any_uncommitted; - _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); - if (any_uncommitted) { - bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { - memid->initially_committed = false; - } - else { - if (commit_zero) { memid->initially_zero = true; } - } - } - } - else { - // no need to commit, but check if already fully committed - memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); - } - - return p; -} - -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED_RELEASE(alignment); - mi_assert(alignment <= MI_SEGMENT_ALIGN); - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_size_of_blocks(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_arena_from_index(arena_index); - if (arena == NULL) return NULL; - if (!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } - - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} - - -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED(alignment); - mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // 
try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - } - return NULL; -} - -// try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) -{ - if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); - if (arena_count > (MI_MAX_ARENAS - 4)) return false; - - size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); - if (arena_reserve == 0) return false; - - if (!_mi_os_has_virtual_reserve()) { - arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) - } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); - arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); - if (arena_count >= 8 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); - size_t reserve = 0; - if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { - arena_reserve = reserve; - } - } - if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size - - // commit eagerly? - bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } - else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); -} - - -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - *memid = _mi_memid_none(); - - const int numa_node = _mi_os_numa_node(tld); // current numa node - - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
- if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - } - } - - // if we cannot use OS allocation, return NULL - if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { - errno = ENOMEM; - return NULL; - } - - // finally, fall back to the OS - if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); - } - else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); - } -} - -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); -} - - -void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { - if (size != NULL) *size = 0; - size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); - if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } - return arena->start; -} - - -/* ----------------------------------------------------------- - Arena purge ------------------------------------------------------------ */ - -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} - -// reset or decommit in an arena and update the committed/decommit bitmaps -// assumes we own the area (i.e. blocks_in_use is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_blocks(blocks); - void* const p = mi_arena_block_start(arena, bitmap_idx); - bool needs_recommit; - if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { - // all blocks are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); - } - else { - // some blocks are not committed -- this can happen when a partially committed block is freed - // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), - // and also undo the decommit stats (as it was already adjusted) - mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? 
*/, stats); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } - } - - // clear the purged blocks - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); - // update committed bitmap - if (needs_recommit) { - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); - } -} - -// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. -// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_purge != NULL); - const long delay = mi_arena_purge_delay(); - if (delay < 0) return; // is purging allowed at all? - - if (_mi_preloading() || delay == 0) { - // decommit directly - mi_arena_purge(arena, bitmap_idx, blocks, stats); - } - else { - // schedule decommit - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire != 0) { - mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay - } - else { - mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); - } - _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); - } -} - -// purge a range of blocks -// return true if the full range was purged. -// assumes we own the area (i.e. blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) { - const size_t endidx = startseqx + bitlen; - size_t bitseqx = startseqx; - bool all_purged = false; - while (bitseqx < endidx) { - // count consecutive ones in the purge mask - size_t count = 0; - while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) { - count++; - } - if (count > 0) { - // found range to be purged - const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx); - mi_arena_purge(arena, range_idx, count, stats); - if (count == bitlen) { - all_purged = true; - } - } - bitseqx += (count+1); // +1 to skip the zero bit (or end) - } - return all_purged; -} - -// returns true if anything was purged -static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) -{ - if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire == 0) return false; - if (!force && expire > now) return false; - - // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); - - // potential purges scheduled, walk through the bitmap - bool any_purged = false; - bool full_purge = true; - for (size_t i = 0; i < arena->field_count; i++) { - size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); - if (purge != 0) { - size_t bitseqx = 0; - while (bitseqx < MI_BITMAP_FIELD_BITS) { - // find consecutive range of ones in the purge mask - size_t bitlen = 0; - while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) { - bitlen++; - } - // temporarily claim the purge range as "in-use" to be thread-safe with allocation - // try to claim the longest range of corresponding in_use bits - const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx); - while( bitlen > 0 ) { - if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, 
bitlen, bitmap_index)) { - break; - } - bitlen--; - } - // actual claimed bits at `in_use` - if (bitlen > 0) { - // read purge again now that we have the in_use bits - purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) { - full_purge = false; - } - any_purged = true; - // release the claimed `in_use` bits again - _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); - } - bitseqx += (bitlen+1); // +1 to skip the zero (or end) - } // while bitseqx - } // purge != 0 - } - // if not fully purged, make sure to purge again in the future - if (!full_purge) { - const long delay = mi_arena_purge_delay(); - mi_msecs_t expected = 0; - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); - } - return any_purged; -} - -static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { - if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - - const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); - if (max_arena == 0) return; - - // allow only one thread to purge at a time - static mi_atomic_guard_t purge_guard; - mi_atomic_guard(&purge_guard) - { - mi_msecs_t now = _mi_clock_now(); - size_t max_purge_count = (visit_all ? max_arena : 1); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - if (mi_arena_try_purge(arena, now, force, stats)) { - if (max_purge_count <= 1) break; - max_purge_count--; - } - } - } - } -} - - -/* ----------------------------------------------------------- - Arena free ------------------------------------------------------------ */ - -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); - mi_assert_internal(committed_size <= size); - if (p==NULL) return; - if (size==0) return; - const bool all_committed = (committed_size == size); - - // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
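/* A compact sketch (hypothetical names, plain C11 atomics, not part of the
   patch) of the expiry pattern used by the purger above: scheduling a purge
   stores a deadline in a single atomic timestamp, and a purger only proceeds
   once that deadline has passed, resetting it so the scan is not repeated.
   The patch's version additionally honors `force` and re-arms the deadline
   when the purge was only partial. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef int64_t example_msecs_t;

static bool example_should_purge(_Atomic(example_msecs_t)* purge_expire,
                                 example_msecs_t now, bool force) {
  example_msecs_t expire = atomic_load_explicit(purge_expire, memory_order_relaxed);
  if (expire == 0) return false;            // nothing scheduled
  if (!force && expire > now) return false; // scheduled, but not expired yet
  // reset the deadline; at most one concurrent caller wins and does the scan
  return atomic_compare_exchange_strong(purge_expire, &expire, 0);
}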
- mi_track_mem_undefined(p,size); - - if (mi_memkind_is_os(memid.memkind)) { - // was a direct OS allocation, pass through - if (!all_committed && committed_size > 0) { - // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - _mi_os_free(p, size, memid, stats); - } - else if (memid.memkind == MI_MEM_ARENA) { - // allocated in an arena - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const size_t blocks = mi_block_count_of_size(size); - - // checks - if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); - if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { - _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - - // potentially decommit - if (arena->memid.is_pinned || arena->blocks_committed == NULL) { - mi_assert_internal(all_committed); - } - else { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - - if (!all_committed) { - // mark the entire range as no longer committed (so we recommit the full range when re-using) - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); - mi_track_mem_noaccess(p,size); - if (committed_size > 0) { - // if partially committed, adjust the committed stats (is it will be recommitted when re-using) - // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - // note: if not all committed, it may be that the purge will reset/decommit the entire range - // that contains already decommitted parts. Since purge consistently uses reset or decommit that - // works (as we should never reset decommitted parts). - } - // (delay) purge the entire range - mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); - } - - // and make it available to others again - bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); - if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); - return; - }; - } - else { - // arena was none, external, or static; nothing to do - mi_assert_internal(memid.memkind < MI_MEM_OS); - } - - // purge expired decommits - mi_arenas_try_purge(false, false, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. 
-static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - size_t new_max_arena = 0; - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - mi_lock_done(&arena->abandoned_visit_lock); - if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); - } - else { - new_max_arena = i; - } - _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); - } - } - - // try to lower the max arena. - size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); -} - -// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { - mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas -} - -// Is a pointer inside any of our arenas? -bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { - return true; - } - } - return false; -} - -/* ----------------------------------------------------------- - Add an arena. ------------------------------------------------------------ */ - -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { - mi_assert_internal(arena != NULL); - mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); - mi_assert_internal(arena->block_count > 0); - if (arena_id != NULL) { *arena_id = -1; } - - size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); - if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); - return false; - } - _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } - return true; -} - -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept -{ - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - if (size < MI_ARENA_BLOCK_SIZE) return false; - - if (is_large) { - mi_assert_internal(memid.initially_committed && memid.is_pinned); - } - - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; - const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); - const size_t bitmaps = (memid.is_pinned ? 
3 : 5); - const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); - mi_memid_t meta_memid; - mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); - if (arena == NULL) return false; - - // already zero'd due to zalloc - // _mi_memzero(arena, asize); - arena->id = _mi_arena_id_none(); - arena->memid = memid; - arena->exclusive = exclusive; - arena->meta_size = asize; - arena->meta_memid = meta_memid; - arena->block_count = bcount; - arena->field_count = fields; - arena->start = (uint8_t*)start; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) - arena->is_large = is_large; - arena->purge_expire = 0; - arena->search_idx = 0; - mi_lock_init(&arena->abandoned_visit_lock); - // consecutive bitmaps - arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap - arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap - arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap - arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap - // initialize committed bitmap? - if (arena->blocks_committed != NULL && arena->memid.initially_committed) { - memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning - } - - // and claim leftover blocks if needed (so we never allocate there) - ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; - mi_assert_internal(post >= 0); - if (post > 0) { - // don't use leftover bits at the end - mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL); - } - return mi_arena_add(arena, arena_id, &_mi_stats_main); - -} - -bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); - memid.initially_committed = is_committed; - memid.initially_zero = is_zero; - memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block - mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); - if (start == NULL) return ENOMEM; - const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); - _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); - return ENOMEM; - } - _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? 
" (in large os pages)" : ""); - return 0; -} - - -// Manage a range of regular OS memory -bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { - return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { - return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); -} - - -/* ----------------------------------------------------------- - Debugging ------------------------------------------------------------ */ - -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { - _mi_verbose_message("%s%s:\n", prefix, header); - size_t bcount = 0; - size_t inuse_count = 0; - for (size_t i = 0; i < field_count; i++) { - char buf[MI_BITMAP_FIELD_BITS + 1]; - uintptr_t field = mi_atomic_load_relaxed(&fields[i]); - for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { - if (bcount < block_count) { - bool inuse = ((((uintptr_t)1 << bit) & field) != 0); - if (inuse) inuse_count++; - buf[bit] = (inuse ? 'x' : '.'); - } - else { - buf[bit] = ' '; - } - } - buf[MI_BITMAP_FIELD_BITS] = 0; - _mi_verbose_message("%s %s\n", prefix, buf); - } - _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); - return inuse_count; -} - -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); - size_t inuse_total = 0; - size_t abandoned_total = 0; - size_t purge_total = 0; - for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena == NULL) break; - _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); - if (show_inuse) { - inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); - } - if (arena->blocks_committed != NULL) { - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); - } - if (show_abandoned) { - abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); - } - if (show_purge && arena->blocks_purge != NULL) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); - } - } - if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); - if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); -} - - -/* ----------------------------------------------------------- - Reserve a huge page arena. 
------------------------------------------------------------ */ -// reserve at a specific numa node -int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; - if (pages==0) return 0; - if (numa_node < -1) numa_node = -1; - if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); - size_t hsize = 0; - size_t pages_reserved = 0; - mi_memid_t memid; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); - if (p==NULL || pages_reserved==0) { - _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); - return ENOMEM; - } - _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); - return ENOMEM; - } - return 0; -} - -int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { - return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); -} - -// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) -int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { - if (pages == 0) return 0; - - // pages per numa node - size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); - if (numa_count <= 0) numa_count = 1; - const size_t pages_per = pages / numa_count; - const size_t pages_mod = pages % numa_count; - const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); - - // reserve evenly among numa nodes - for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { - size_t node_pages = pages_per; // can be 0 - if (numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); - if (err) return err; - if (pages < node_pages) { - pages = 0; - } - else { - pages -= node_pages; - } - } - - return 0; -} - -int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { - MI_UNUSED(max_secs); - _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); - if (pages_reserved != NULL) *pages_reserved = 0; - int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); - if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; - return err; -} - - -#endif \ No newline at end of file diff --git a/src/page-map.c b/src/page-map.c index d70c3ee6..dc0145f2 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -42,6 +42,21 @@ static bool mi_page_map_init(void) { return true; } +static void mi_page_map_ensure_committed(void* p, size_t idx, size_t block_count) { + // is the page map area that contains the page address committed? 
+ if (!mi_page_map_all_committed) { + const size_t commit_bit_count = _mi_divide_up(block_count, mi_blocks_per_commit_bit); + const size_t commit_bit_idx = idx / mi_blocks_per_commit_bit; + for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { + // this may race, in which case we do multiple commits (which is ok) + _mi_os_commit((uint8_t*)p + (i*mi_blocks_per_commit_bit*MI_ARENA_BLOCK_SIZE), mi_blocks_per_commit_bit* MI_ARENA_BLOCK_SIZE, NULL, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); + } + } + } +} + static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* block_count) { size_t page_size; *page_start = mi_page_area(page, &page_size); @@ -60,18 +75,7 @@ void _mi_page_map_register(mi_page_t* page) { size_t block_count; const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); - // is the page map area that contains the page address committed? - if (!mi_page_map_all_committed) { - const size_t commit_bit_count = _mi_divide_up(block_count, mi_blocks_per_commit_bit); - const size_t commit_bit_idx = idx / mi_blocks_per_commit_bit; - for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { - // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit(page_start + (i*mi_blocks_per_commit_bit*MI_ARENA_BLOCK_SIZE), mi_blocks_per_commit_bit* MI_ARENA_BLOCK_SIZE, NULL, NULL); - mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); - } - } - } + mi_page_map_ensure_committed(page, idx, block_count); // set the offsets for (int i = 0; i < (int)block_count; i++) { @@ -92,3 +96,14 @@ void _mi_page_map_unregister(mi_page_t* page) { // unset the offsets _mi_memzero(_mi_page_map + idx, block_count); } + + +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); + if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_blocks_per_commit_bit, 1)) { + return (_mi_page_map[idx] != 0); + } + else { + return false; + } +} \ No newline at end of file From e0152ab82fbe0d8a94d1068fdaed2947d3900284 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 16:58:52 -0800 Subject: [PATCH 006/264] wip: update any_set --- src/bitmap.c | 70 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 24c0d9c9..5ac4ca08 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -76,7 +76,6 @@ static bool mi_bfield_atomic_try_xset( mi_bit_t set, _Atomic(mi_bfield_t)*b, siz return mi_bfield_atomic_xset(set, b, idx); } - // Tries to (un)set a mask atomically, and returns true if the mask bits atomically transitioned from 0 to mask (or mask to 0) // and false otherwise (leaving the bit field as is). static bool mi_bfield_atomic_try_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask ) { @@ -97,6 +96,15 @@ static bool mi_bfield_atomic_try_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)* b } } +// Tries to set/clear a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise (leaving the bit field as is). 
+static bool mi_bfield_atomic_try_xset8(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t byte_idx) { + mi_assert_internal(byte_idx < MI_BFIELD_SIZE); + const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); + return mi_bfield_atomic_try_xset_mask(set, b, mask); +} + + // Check if all bits corresponding to a mask are set/cleared. static bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask) { mi_assert_internal(mask != 0); @@ -108,12 +116,11 @@ static bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, } } -// Tries to set/clear a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise (leaving the bit field as is). -static bool mi_bfield_atomic_try_xset8(mi_bit_t set, _Atomic(mi_bfield_t)* b, size_t byte_idx ) { - mi_assert_internal(byte_idx < MI_BFIELD_SIZE); - const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); - return mi_bfield_atomic_try_xset_mask(set,b,mask); +// Check if a bit is set/clear +static inline bool mi_bfield_atomic_is_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = ((mi_bfield_t)1)<any_set, idx); + } + else { // clear + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR, &bitmap->any_set, idx); + } + } +} + // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { if (!already_zero) { @@ -423,6 +441,7 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ if (m > n) { m = n; } bool already_xset; mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); + mi_bitmap_update_anyset(set, bitmap, chunk_idx); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; @@ -430,8 +449,12 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; if (mid_chunks > 0) { _mi_memset(&bitmap->chunks[chunk_idx], (set ? 
~0 : 0), MI_BITMAP_CHUNK_BITS/8); - chunk_idx += mid_chunks; - n -= mid_chunks * MI_BITMAP_CHUNK_BITS; + const size_t end_chunk = chunk_idx + mid_chunks; + while (chunk_idx < end_chunk) { + mi_bitmap_update_anyset(set, bitmap, chunk_idx); + chunk_idx++; + } + n -= (mid_chunks * MI_BITMAP_CHUNK_BITS); } // last chunk @@ -439,6 +462,7 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); + mi_bitmap_update_anyset(set, bitmap, chunk_idx); } } @@ -449,7 +473,9 @@ bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < MI_BITMAP_MAX_BITS); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); + bool ok = mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); + if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } + return ok; } // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) @@ -459,7 +485,9 @@ bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx%8 == 0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; - return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); + bool ok = mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); + if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } + return ok; } // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) @@ -475,8 +503,12 @@ bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); + + bool ok = mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); + if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } + return ok; } // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). 
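The hunks above and below maintain a one-bit-per-chunk summary: setters mark the chunk's bit in `any_set`, and a clear drops it only once the whole chunk is observed to be clear, so searches can skip chunks that are known to be empty. Below is a standalone sketch of the same idea, reduced to plain C11 atomics with hypothetical names; the summary here is only a best-effort hint, whereas the patch handles concurrent updates more carefully.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define EX_CHUNKS 8

typedef struct example_bitmap_s {
  _Atomic(uint64_t) chunk[EX_CHUNKS];  // the real bits (one 64-bit field per chunk here)
  _Atomic(uint8_t)  any_set;           // summary: bit i set => chunk i may contain set bits
} example_bitmap_t;

static void example_set(example_bitmap_t* bm, size_t idx) {
  const size_t ci = idx / 64;
  atomic_fetch_or(&bm->chunk[ci], (uint64_t)1 << (idx % 64));
  atomic_fetch_or(&bm->any_set, (uint8_t)(1u << ci));     // chunk is certainly non-empty now
}

static void example_clear(example_bitmap_t* bm, size_t idx) {
  const size_t ci = idx / 64;
  atomic_fetch_and(&bm->chunk[ci], ~((uint64_t)1 << (idx % 64)));
  if (atomic_load(&bm->chunk[ci]) == 0) {                  // can race with a concurrent set;
    atomic_fetch_and(&bm->any_set, (uint8_t)~(1u << ci));  // treat the summary as a hint only
  }
}

static bool example_find_set(example_bitmap_t* bm, size_t* idx) {
  const uint8_t any = atomic_load(&bm->any_set);
  for (size_t ci = 0; ci < EX_CHUNKS; ci++) {
    if ((any & (1u << ci)) == 0) continue;                 // skip chunks known to be empty
    const uint64_t b = atomic_load(&bm->chunk[ci]);
    for (size_t j = 0; j < 64; j++) {
      if (b & ((uint64_t)1 << j)) { *idx = ci*64 + j; return true; }
    }
  }
  return false;
}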
@@ -488,13 +520,17 @@ bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bo if (already_xset==NULL) { already_xset = &local_already_xset; } // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); + + const bool allx = mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); + mi_bitmap_update_anyset(set, bitmap, chunk_idx); + return allx; } // Is a sequence of n bits already all set/cleared? @@ -502,10 +538,13 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); } @@ -578,6 +617,9 @@ bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, s // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries if (n == 0 || n > MI_BFIELD_BITS) return false; + if (n == 1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); + if (n == 8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; From 0f635413d678608bd04600faf4e6d558507f7284 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 17:50:37 -0800 Subject: [PATCH 007/264] wip: can run initial test --- include/mimalloc/internal.h | 5 +++-- src/arena.c | 9 ++++++--- src/bitmap.c | 2 +- src/init.c | 2 +- src/page-map.c | 20 ++++++++++---------- test/test-stress.c | 4 ++++ 6 files changed, 25 insertions(+), 17 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3c8216ec..47301e79 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -447,7 +447,7 @@ static inline mi_page_t* _mi_ptr_page(const void* p) { #if MI_DEBUG if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; #endif - return (mi_page_t*)((up + ofs - 1) << MI_ARENA_BLOCK_SHIFT); + return (mi_page_t*)((up + ofs + 1) << MI_ARENA_BLOCK_SHIFT); } @@ -663,7 +663,8 @@ We also pass a separate `null` value to be used as `NULL` or otherwise ------------------------------------------------------------------- */ static inline bool mi_is_in_same_page(const void* p, const void* q) { - return (_mi_ptr_page(p) == _mi_ptr_page(q)); + // return (_mi_ptr_page(p) == _mi_ptr_page(q)); + return ((uintptr_t)p / MI_LARGE_PAGE_SIZE) == ((uintptr_t)q / MI_LARGE_PAGE_SIZE); } static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { diff --git a/src/arena.c b/src/arena.c index 9dbf73d6..a8dff8a5 100644 --- a/src/arena.c +++ b/src/arena.c @@ -415,7 
+415,8 @@ void* _mi_arena_alloc_aligned( } // fall back to the OS - return mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); + void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); + return p; } void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) @@ -498,6 +499,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz else { page->block_size_shift = 0; } + _mi_page_map_register(page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); @@ -564,12 +566,13 @@ static uint8_t* mi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { const size_t diff = pstart - (uint8_t*)page; const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); if (psize != NULL) { *psize = size; } - return pstart; + return (uint8_t*)page; } void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { size_t size; uint8_t* pstart = mi_arena_page_allocated_area(page, &size); + _mi_page_map_unregister(page); _mi_arena_free(pstart, size, size, page->memid, &tld->stats); } @@ -1110,7 +1113,7 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); if (max_arena == 0) return; - _mi_error_message(EFAULT, "purging not yet implemented\n"); + // _mi_error_message(EFAULT, "purging not yet implemented\n"); MI_UNUSED(stats); MI_UNUSED(visit_all); MI_UNUSED(force); diff --git a/src/bitmap.c b/src/bitmap.c index 5ac4ca08..175bc0ec 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -144,7 +144,7 @@ static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, si // Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0) static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; bool all_already_xset = true; diff --git a/src/init.c b/src/init.c index 215d6be8..d11f5b5a 100644 --- a/src/init.c +++ b/src/init.c @@ -130,7 +130,7 @@ static mi_decl_cache_align mi_subproc_t mi_subproc_default; static mi_decl_cache_align mi_tld_t tld_main = { 0, false, &_mi_heap_main, &_mi_heap_main, - NULL, // subproc + &mi_subproc_default, // subproc 0, // tseq { 0, &tld_main.stats }, // os { MI_STATS_NULL } // stats diff --git a/src/page-map.c b/src/page-map.c index dc0145f2..8dfd2f26 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -11,7 +11,7 @@ terms of the MIT license. 
A copy of the license can be found in the file mi_decl_cache_align signed char* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; -static size_t mi_blocks_per_commit_bit = 1; +static size_t mi_size_per_commit_bit = MI_ARENA_BLOCK_SIZE; static mi_memid_t mi_page_map_memid; static mi_bitmap_t mi_page_map_commit; @@ -20,13 +20,12 @@ static bool mi_page_map_init(void) { if (vbits >= 48) vbits = 47; // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) // 64 KiB for 4 GiB address space (on 32-bit) - const size_t page_map_size = (MI_ZU(1) << (vbits >> MI_ARENA_BLOCK_SHIFT)); + const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_BLOCK_SHIFT)); - const size_t min_commit_size = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); - mi_blocks_per_commit_bit = mi_block_count_of_size(min_commit_size); + mi_size_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems - _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 0, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); + _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); return false; @@ -38,6 +37,7 @@ static bool mi_page_map_init(void) { // commit the first part so NULL pointers get resolved without an access violation if (!mi_page_map_all_committed) { _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL); + _mi_page_map[0] = -1; // so _mi_ptr_page(NULL) == NULL } return true; } @@ -45,12 +45,12 @@ static bool mi_page_map_init(void) { static void mi_page_map_ensure_committed(void* p, size_t idx, size_t block_count) { // is the page map area that contains the page address committed? 
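/* Worked numbers for the sizing above (illustrative, not part of the patch;
   assumes 47 usable address bits as in the comment above and 64 KiB arena
   blocks, i.e. MI_ARENA_BLOCK_SHIFT == 16). The commit bitmap then divides the
   map into MI_BITMAP_MAX_BITS equal slices, and `mi_page_map_ensure_committed`
   commits a slice the first time a page falls into it. */
#include <stdint.h>

static uint64_t example_page_map_bytes(void) {
  const uint64_t vbits = 47;                     // assumed usable virtual address bits
  const uint64_t block_shift = 16;               // assumed 64 KiB arena blocks
  return (uint64_t)1 << (vbits - block_shift);   // 2^31 bytes = 2 GiB: one byte per block
}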
if (!mi_page_map_all_committed) { - const size_t commit_bit_count = _mi_divide_up(block_count, mi_blocks_per_commit_bit); - const size_t commit_bit_idx = idx / mi_blocks_per_commit_bit; + const size_t commit_bit_count = _mi_divide_up(block_count, mi_size_per_commit_bit); + const size_t commit_bit_idx = idx / mi_size_per_commit_bit; for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit((uint8_t*)p + (i*mi_blocks_per_commit_bit*MI_ARENA_BLOCK_SIZE), mi_blocks_per_commit_bit* MI_ARENA_BLOCK_SIZE, NULL, NULL); + _mi_os_commit(_mi_page_map + ((commit_bit_idx + i)*mi_size_per_commit_bit), mi_size_per_commit_bit, NULL, NULL); mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); } } @@ -75,7 +75,7 @@ void _mi_page_map_register(mi_page_t* page) { size_t block_count; const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); - mi_page_map_ensure_committed(page, idx, block_count); + mi_page_map_ensure_committed(page_start, idx, block_count); // set the offsets for (int i = 0; i < (int)block_count; i++) { @@ -100,7 +100,7 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_blocks_per_commit_bit, 1)) { + if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_size_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { diff --git a/test/test-stress.c b/test/test-stress.c index 1e70e699..c7288b1a 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,6 +40,10 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; +#elif 1 +static int THREADS = 1; +static int SCALE = 10; +static int ITER = 10; #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor From 978d844e156b0455bdc39837fade4788f5c34d5e Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 20:23:39 -0800 Subject: [PATCH 008/264] wip: bug fixes --- include/mimalloc/types.h | 5 +- src/arena.c | 114 ++++++++++++++++++++------------------- src/bitmap.c | 6 +-- src/options.c | 2 +- src/page.c | 11 ++-- test/test-stress.c | 5 +- 6 files changed, 78 insertions(+), 65 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 591cb603..e3c0786c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -171,12 +171,13 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) { } typedef struct mi_memid_os_info { - void* base; // actual base address of the block (used for offset aligned allocations) + void* base; // actual base address of the block (used for offset aligned allocations) size_t alignment; // alignment at allocation } mi_memid_os_info_t; typedef struct mi_memid_arena_info { - size_t block_index; // index in the arena + uint32_t block_index; // base index in the arena + uint32_t block_count; // allocated blocks mi_arena_id_t id; // arena id (>= 1) bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; diff --git a/src/arena.c b/src/arena.c index a8dff8a5..c5d8b14a 100644 --- a/src/arena.c +++ b/src/arena.c @@ 
-129,7 +129,7 @@ static uint8_t* mi_arena_start(mi_arena_t* arena) { } // Start of a block -void* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { +uint8_t* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { return (mi_arena_start(arena) + mi_size_of_blocks(block_index)); } @@ -146,35 +146,40 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { // Create an arena memid -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) { +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index, size_t block_count) { + mi_assert_internal(block_index < UINT32_MAX); + mi_assert_internal(block_count < UINT32_MAX); mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); memid.mem.arena.id = id; - memid.mem.arena.block_index = block_index; + memid.mem.arena.block_index = (uint32_t)block_index; + memid.mem.arena.block_count = (uint32_t)block_count; memid.mem.arena.is_exclusive = is_exclusive; return memid; } // returns if the arena is exclusive -static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { +static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index, size_t* block_count) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); *arena_index = mi_arena_id_index(memid.mem.arena.id); - *block_index = memid.mem.arena.block_index; + if (block_index) *block_index = memid.mem.arena.block_index; + if (block_count) *block_count = memid.mem.arena.block_count; return memid.mem.arena.is_exclusive; } // get the arena and block index -static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* block_index) { +static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* block_index, size_t* block_count) { size_t arena_index; - mi_arena_memid_indices(memid, &arena_index, block_index); + mi_arena_memid_indices(memid, &arena_index, block_index, block_count); return mi_arena_from_index(arena_index); } -static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index) { +static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index, size_t* block_count) { // todo: maybe store the arena* directly in the page? - return mi_arena_from_memid(page->memid, block_index); + return mi_arena_from_memid(page->memid, block_index, block_count); } + /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ @@ -187,7 +192,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // claimed it! void* p = mi_arena_block_start(arena, block_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index, needed_bcount); memid->is_pinned = arena->memid.is_pinned; // set the dirty bits @@ -424,7 +429,15 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } - +static uint8_t* xmi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { + // todo: record real allocated size instead of trying to recalculate? 
+ size_t page_size; + uint8_t* const pstart = mi_page_area(page, &page_size); + const size_t diff = pstart - (uint8_t*)page; + const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); + if (psize != NULL) { *psize = size; } + return (uint8_t*)page; +} /* ----------------------------------------------------------- Arena page allocation @@ -467,7 +480,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz { const bool allow_large = true; const bool commit = true; - const size_t alignment = MI_ARENA_BLOCK_ALIGN; + const size_t alignment = 1; // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); @@ -515,13 +528,13 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size // 1. look for an abandoned page mi_page_t* page = mi_arena_page_try_find_abandoned(block_count, block_size, req_arena_id, tld); if (page != NULL) { - _mi_page_reclaim(heap,page); - return page; + return page; // return as abandoned } // 2. find a free block, potentially allocating a new arena page = mi_arena_page_alloc_fresh(block_count, block_size, req_arena_id, tld); - if (page != NULL) { + if (page != NULL) { + mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.block_count == block_count); _mi_page_init(heap, page); return page; } @@ -559,21 +572,11 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ return page; } -static uint8_t* mi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { - // todo: record real allocated size instead of trying to recalculate? - size_t page_size; - uint8_t* const pstart = mi_page_area(page, &page_size); - const size_t diff = pstart - (uint8_t*)page; - const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); - if (psize != NULL) { *psize = size; } - return (uint8_t*)page; -} + void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { - size_t size; - uint8_t* pstart = mi_arena_page_allocated_area(page, &size); _mi_page_map_unregister(page); - _mi_arena_free(pstart, size, size, page->memid, &tld->stats); + _mi_arena_free(page, 0, 0, page->memid, &tld->stats); } /* ----------------------------------------------------------- @@ -595,11 +598,9 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { // leave as is; it will be reclaimed on the first free } else if (page->memid.memkind==MI_MEM_ARENA) { - size_t size; - mi_arena_page_allocated_area(page, &size); - size_t bin = _mi_bin(mi_page_block_size(page)); + size_t bin = _mi_bin(mi_page_block_size(page)); size_t block_index; - mi_arena_t* arena = mi_page_arena(page, &block_index); + mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_abandoned[bin], block_index, 1, NULL); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); @@ -618,7 +619,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { if mi_likely(memid.memkind == MI_MEM_ARENA) { size_t block_index; - mi_arena_t* arena = mi_page_arena(page, &block_index); + mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess // don't reclaim more from a `free` call than half the current segments @@ -657,7 +658,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t static void mi_arenas_try_purge(bool force, bool visit_all, 
mi_stats_t* stats); void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(size >= 0 && stats != NULL); mi_assert_internal(committed_size <= size); if (p==NULL) return; if (size==0) return; @@ -676,21 +677,19 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } else if (memid.memkind == MI_MEM_ARENA) { // allocated in an arena - size_t arena_idx; + size_t block_count; size_t block_idx; - mi_arena_memid_indices(memid, &arena_idx, &block_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const size_t blocks = mi_block_count_of_size(size); - + mi_arena_t* arena = mi_arena_from_memid(memid, &block_idx, &block_count); + mi_assert_internal(size==1); + mi_assert_internal(mi_arena_block_start(arena,block_idx) <= p); + mi_assert_internal(mi_arena_block_start(arena,block_idx) + mi_size_of_blocks(block_count) > p); // checks if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } mi_assert_internal(block_idx < arena->block_count); - mi_assert_internal(block_idx > mi_arena_info_blocks()); + mi_assert_internal(block_idx >= mi_arena_info_blocks()); if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) { _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; @@ -703,7 +702,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi else { if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, block_idx, block_count, NULL); mi_track_mem_noaccess(p, size); if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) @@ -715,13 +714,13 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // works (as we should never reset decommitted parts). 
} // (delay) purge the entire range - mi_arena_schedule_purge(arena, block_idx, blocks, stats); + mi_arena_schedule_purge(arena, block_idx, block_count, stats); } // and make it available to others again - bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL); + bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, block_count, NULL); if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_block_start(arena,block_idx), mi_size_of_blocks(block_count)); return; }; } @@ -846,7 +845,11 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; mi_lock_init(&arena->abandoned_visit_lock); - + mi_heap_t* heap = mi_heap_get_default(); + if (heap != NULL) { + arena->subproc = heap->tld->subproc; + } + // init bitmaps mi_bitmap_init(&arena->blocks_free,true); mi_bitmap_init(&arena->blocks_committed,true); @@ -925,18 +928,21 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ size_t bit_count = 0; size_t bit_set_count = 0; for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { - char buf[MI_BITMAP_CHUNK_BITS + 1]; + char buf[MI_BITMAP_CHUNK_BITS + 32]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; - for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + for (int j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { if (bit_count < block_count) { - bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS); + bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + k); + k += MI_BFIELD_BITS; + buf[k++] = ' '; } else { - _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS); - } - bit_count += MI_BFIELD_BITS; + _mi_memset(buf + k, ' ', MI_BFIELD_BITS); + k += MI_BFIELD_BITS; + } + bit_count += MI_BFIELD_BITS; } - buf[MI_BITMAP_CHUNK_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); } _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); @@ -954,7 +960,7 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; block_total += arena->block_count; - _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_verbose_message("arena %zu: %zu blocks (%zu MiB)%s\n", i, arena->block_count, mi_size_of_blocks(arena->block_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); } diff --git a/src/bitmap.c b/src/bitmap.c index 175bc0ec..1a1bb031 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -170,7 +170,7 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t // Check if a sequence of `n` bits within a chunk are all set/cleared. 
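/* Sketch of the index decomposition used by the chunked bitmap operations
   below (illustrative; the 512-bit chunks and 64-bit fields are assumed here,
   the real constants come from the bitmap header). Because a run of n bits may
   not yet cross a chunk boundary, callers assert `cidx + n <= chunk bits`. */
#include <stddef.h>

typedef struct example_bit_pos_s {
  size_t chunk_idx;   // which chunk of the bitmap
  size_t field_idx;   // which field inside that chunk
  size_t bit_idx;     // which bit inside that field
} example_bit_pos_t;

static example_bit_pos_t example_decompose(size_t idx) {
  const size_t chunk_bits = 512, field_bits = 64;  // assumed sizes
  example_bit_pos_t p;
  p.chunk_idx = idx / chunk_bits;
  const size_t cidx = idx % chunk_bits;
  p.field_idx = cidx / field_bits;
  p.bit_idx   = cidx % field_bits;
  return p;
}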
static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_xset = true; size_t idx = cidx % MI_BFIELD_BITS; @@ -350,7 +350,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit b >>= idx; bshift += idx; - if (bshift + n >= MI_BFIELD_BITS) break; + if (bshift + n > MI_BFIELD_BITS) break; if ((b&mask) == mask) { // found a match mi_assert_internal( ((mask << bshift) >> bshift) == mask ); @@ -448,7 +448,7 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ n -= m; const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8); + _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); const size_t end_chunk = chunk_idx + mid_chunks; while (chunk_idx < end_chunk) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); diff --git a/src/options.c b/src/options.c index d565e269..2eaf29a3 100644 --- a/src/options.c +++ b/src/options.c @@ -132,7 +132,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { -1, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose diff --git a/src/page.c b/src/page.c index 122b4324..3f145347 100644 --- a/src/page.c +++ b/src/page.c @@ -274,12 +274,17 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size #endif mi_page_t* page = _mi_arena_page_alloc(heap, block_size, page_alignment); if (page == NULL) { - // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) + // out-of-memory return NULL; } - mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); + if (mi_page_is_abandoned(page)) { + _mi_page_reclaim(heap, page); + } + else if (pq != NULL) { + mi_page_queue_push(heap, pq, page); + } mi_heap_stat_increase(heap, pages, 1); - if (pq != NULL) { mi_page_queue_push(heap, pq, page); } + mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); mi_assert_expensive(_mi_page_is_valid(page)); return page; } diff --git a/test/test-stress.c b/test/test-stress.c index c7288b1a..2d7557b8 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -244,7 +244,8 @@ static void test_stress(void) { //mi_debug_show_arenas(); #endif #if !defined(NDEBUG) || defined(MI_TSAN) - if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } + if (true) // (n + 1) % 10 == 0) + { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif } } @@ -276,7 +277,7 @@ int main(int argc, char** argv) { mi_option_enable(mi_option_visit_abandoned); #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) - mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); #endif #ifndef USE_STD_MALLOC mi_stats_reset(); From 9d904e864395da8c493e4d6c997f1296bcc1f5e2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 10:39:30 -0800 Subject: [PATCH 009/264] wip: bug fixes --- ide/vs2022/mimalloc.vcxproj | 1 - include/mimalloc/internal.h | 9 ++++++++- src/arena.c | 33 ++++++++++++++++++--------------- src/init.c | 2 +- src/page-map.c | 12 ++++++------ src/page.c | 10 ++++++---- src/prim/windows/prim.c | 9 +++++++-- test/test-stress.c | 12 ++++++------ 8 files changed, 52 insertions(+), 36 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 8606faf3..9e8dab78 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -120,7 +120,6 @@ CompileAsCpp false stdcpp20 - AdvancedVectorExtensions2 diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 47301e79..119b7b93 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -467,6 +467,12 @@ static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { return mi_page_start(page); } +static inline bool mi_page_contains_address(const mi_page_t* page, const void* p) { + size_t psize; + uint8_t* start = mi_page_area(page, &psize); + return (start <= p && p < start + psize); +} + static inline bool mi_page_is_in_arena(const mi_page_t* page) { return (page->memid.memkind == MI_MEM_ARENA); } @@ -663,8 +669,9 @@ We also pass a separate `null` value to be used as `NULL` or otherwise ------------------------------------------------------------------- */ static inline bool mi_is_in_same_page(const void* p, const void* q) { + mi_page_t* page = _mi_ptr_page(p); + return 
mi_page_contains_address(page,q); // return (_mi_ptr_page(p) == _mi_ptr_page(q)); - return ((uintptr_t)p / MI_LARGE_PAGE_SIZE) == ((uintptr_t)q / MI_LARGE_PAGE_SIZE); } static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { diff --git a/src/arena.c b/src/arena.c index c5d8b14a..632c7a2a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -316,7 +316,7 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are if (_idx >= _max_arena) { _idx -= _max_arena; } \ const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); MI_UNUSED(var_arena_id);\ mi_arena_t* const var_arena = mi_arena_from_index(_idx); \ - if (mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ + if (var_arena != NULL && mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ { #define mi_forall_arenas_end() }}} @@ -576,7 +576,7 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { _mi_page_map_unregister(page); - _mi_arena_free(page, 0, 0, page->memid, &tld->stats); + _mi_arena_free(page, 1, 1, page->memid, &tld->stats); } /* ----------------------------------------------------------- @@ -590,14 +590,8 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { if (mi_page_all_free(page)) { _mi_arena_page_free(page, tld); } - else if (mi_page_is_full(page)) { // includes singleton pages - // leave as is; it will be reclaimed on free - } - else if (mi_memkind_is_os(page->memid.memkind)) { - _mi_error_message(EINVAL, "implement page abandon for OS allocated pages\n"); - // leave as is; it will be reclaimed on the first free - } else if (page->memid.memkind==MI_MEM_ARENA) { + // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); size_t block_index; mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); @@ -606,14 +600,14 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); } else { - _mi_error_message(EINVAL, "implement page abandon for external allocated pages\n"); - // leave as is; it will be reclaimed on the first free + // page is full (or a singleton), page is OS/externally allocated + // leave as is; it will be reclaimed when an object is free'd in the page } } bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { - mi_assert_internal(mi_page_is_abandoned(page)); - // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned + if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_is_abandoned(page)); } + if (!mi_page_is_abandoned(page)) return false; // it is not abandoned mi_memid_t memid = page->memid; if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's @@ -637,7 +631,16 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { } } else { - _mi_warning_message("implement reclaim for OS allocated pages\n"); + // A page in OS or external memory + // we use the thread_id to atomically grab ownership + // TODO: respect the subproc -- do we need to add this to the page? 
+ mi_threadid_t abandoned_thread_id = 0; + if (mi_atomic_cas_strong_acq_rel(&page->xthread_id, &abandoned_thread_id, heap->thread_id)) { + // we unabandoned partly + _mi_page_reclaim(heap, page); + mi_assert_internal(!mi_page_is_abandoned(page)); + return true; + } } @@ -1193,7 +1196,7 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); - _mi_error_message(EINVAL, "implement mi_abandon_visit_blocks\n"); + _mi_error_message(EINVAL, "implement mi_abandoned_visit_blocks\n"); return false; } diff --git a/src/init.c b/src/init.c index d11f5b5a..40bc5c4a 100644 --- a/src/init.c +++ b/src/init.c @@ -396,7 +396,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } diff --git a/src/page-map.c b/src/page-map.c index 8dfd2f26..e803a367 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file mi_decl_cache_align signed char* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; -static size_t mi_size_per_commit_bit = MI_ARENA_BLOCK_SIZE; +static size_t mi_page_map_size_per_commit_bit = MI_ARENA_BLOCK_SIZE; static mi_memid_t mi_page_map_memid; static mi_bitmap_t mi_page_map_commit; @@ -22,7 +22,7 @@ static bool mi_page_map_init(void) { // 64 KiB for 4 GiB address space (on 32-bit) const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_BLOCK_SHIFT)); - mi_size_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + mi_page_map_size_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); @@ -45,12 +45,12 @@ static bool mi_page_map_init(void) { static void mi_page_map_ensure_committed(void* p, size_t idx, size_t block_count) { // is the page map area that contains the page address committed? 
if (!mi_page_map_all_committed) { - const size_t commit_bit_count = _mi_divide_up(block_count, mi_size_per_commit_bit); - const size_t commit_bit_idx = idx / mi_size_per_commit_bit; + const size_t commit_bit_count = _mi_divide_up(block_count, mi_page_map_size_per_commit_bit); + const size_t commit_bit_idx = idx / mi_page_map_size_per_commit_bit; for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit(_mi_page_map + ((commit_bit_idx + i)*mi_size_per_commit_bit), mi_size_per_commit_bit, NULL, NULL); + _mi_os_commit(_mi_page_map + ((commit_bit_idx + i)*mi_page_map_size_per_commit_bit), mi_page_map_size_per_commit_bit, NULL, NULL); mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); } } @@ -100,7 +100,7 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_size_per_commit_bit, 1)) { + if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_size_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { diff --git a/src/page.c b/src/page.c index 3f145347..b6af4fd0 100644 --- a/src/page.c +++ b/src/page.c @@ -713,7 +713,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { -------------------------------------------------------------*/ // search for a best next page to use for at most N pages (often cut short if immediate blocks are available) -#define MI_MAX_CANDIDATE_SEARCH (8) +#define MI_MAX_CANDIDATE_SEARCH (0) // Find a page with free blocks of `page->block_size`. 
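The page-map hunk above commits the map lazily, one commit-bit region at a time. A simplified, self-contained sketch of that scheme (a plain bool array and a stub stand in for the atomic commit bitmap and `_mi_os_commit`; the inclusive lo/hi range form shown here is the one this series settles on a couple of patches later):

#include <stddef.h>
#include <stdbool.h>

#define ENTRIES_PER_COMMIT_BIT  4096   /* page-map bytes covered per commit bit (illustrative) */
static bool commit_bit[1024];          /* one flag per committable region (illustrative) */

static void os_commit_region(size_t i) { (void)i; }   /* placeholder for _mi_os_commit */

/* ensure page-map entries [idx, idx+count) are backed by committed memory */
static void ensure_committed(size_t idx, size_t count) {
  const size_t lo = idx / ENTRIES_PER_COMMIT_BIT;
  const size_t hi = (idx + count - 1) / ENTRIES_PER_COMMIT_BIT;  /* inclusive upper region */
  for (size_t i = lo; i <= hi; i++) {   /* per region, so a range never straddles bitmap chunks */
    if (!commit_bit[i]) {               /* a racing commit is benign: committing twice is fine */
      os_commit_region(i);
      commit_bit[i] = true;
    }
  }
}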
@@ -788,9 +788,11 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p if (page_candidate != NULL) { page = page_candidate; } - if (page != NULL && !mi_page_immediate_available(page)) { - mi_assert_internal(mi_page_is_expandable(page)); - mi_page_extend_free(heap, page); + if (page != NULL) { + if (!mi_page_immediate_available(page)) { + mi_assert_internal(mi_page_is_expandable(page)); + mi_page_extend_free(heap, page); + } } if (page == NULL) { diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 418c950f..276da85c 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -108,6 +108,8 @@ static bool win_enable_large_os_pages(size_t* large_page_size) // Initialize //--------------------------------------------- +static DWORD win_allocation_granularity = 64*MI_KiB; + void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->has_overcommit = false; @@ -117,7 +119,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) SYSTEM_INFO si; GetSystemInfo(&si); if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; } - if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; } + if (si.dwAllocationGranularity > 0) { + config->alloc_granularity = si.dwAllocationGranularity; + win_allocation_granularity = si.dwAllocationGranularity; + } // get virtual address bits if ((uintptr_t)si.lpMaximumApplicationAddress > 0) { const size_t vbits = MI_INTPTR_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress); @@ -203,7 +208,7 @@ static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_ali } #endif // on modern Windows try use VirtualAlloc2 for aligned allocation - if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { + if (addr == NULL && try_alignment > win_allocation_granularity && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; reqs.Alignment = try_alignment; MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; diff --git a/test/test-stress.c b/test/test-stress.c index 2d7557b8..e287cfa7 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,10 +40,10 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 1 -static int THREADS = 1; -static int SCALE = 10; -static int ITER = 10; +#elif 0 +static int THREADS = 4; +static int SCALE = 20; +static int ITER = 20; #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor @@ -69,7 +69,7 @@ static bool main_participates = false; // main thread participates as a #define custom_realloc(p,s) mi_realloc(p,s) #define custom_free(p) mi_free(p) #ifndef NDEBUG -#define HEAP_WALK // walk the heap objects? +#define xHEAP_WALK // walk the heap objects? 
#endif #endif @@ -323,7 +323,7 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,true,true); mi_collect(true); #endif - mi_stats_print(NULL); + // mi_stats_print(NULL); #endif //bench_end_program(); return 0; From 188294a0dfd26493f64e6049f438de715969e6cb Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 11:12:39 -0800 Subject: [PATCH 010/264] wip: bug fixes --- include/mimalloc/internal.h | 1 + src/arena.c | 39 +++++++++++++++++-------------------- src/options.c | 7 +++++++ src/page-map.c | 23 +++++++++++----------- src/page-queue.c | 2 +- src/page.c | 4 ++-- src/stats.c | 4 ++-- test/test-stress.c | 11 ++++++----- 8 files changed, 49 insertions(+), 42 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 119b7b93..d4ec8bb7 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -63,6 +63,7 @@ void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); void _mi_warning_message(const char* fmt, ...); void _mi_verbose_message(const char* fmt, ...); void _mi_trace_message(const char* fmt, ...); +void _mi_output_message(const char* fmt, ...); void _mi_options_init(void); long _mi_option_get_fast(mi_option_t option); void _mi_error_message(int err, const char* fmt, ...); diff --git a/src/arena.c b/src/arena.c index 632c7a2a..424a9c70 100644 --- a/src/arena.c +++ b/src/arena.c @@ -429,15 +429,7 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } -static uint8_t* xmi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { - // todo: record real allocated size instead of trying to recalculate? - size_t page_size; - uint8_t* const pstart = mi_page_area(page, &page_size); - const size_t diff = pstart - (uint8_t*)page; - const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); - if (psize != NULL) { *psize = size; } - return (uint8_t*)page; -} + /* ----------------------------------------------------------- Arena page allocation @@ -445,6 +437,7 @@ static uint8_t* xmi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { + MI_UNUSED(block_count); const size_t bin = _mi_bin(block_size); mi_assert_internal(bin < MI_BIN_COUNT); @@ -693,7 +686,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } mi_assert_internal(block_idx < arena->block_count); mi_assert_internal(block_idx >= mi_arena_info_blocks()); - if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) { + if (block_idx < mi_arena_info_blocks() || block_idx > arena->block_count) { _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } @@ -926,8 +919,8 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) { - _mi_verbose_message("%s%s:\n", prefix, header); +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap, bool invert) { + _mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { @@ -935,7 +928,11 @@ static 
size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (int j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { if (bit_count < block_count) { - bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + k); + mi_bfield_t bfield = chunk->bfields[j]; + if (invert) bfield = ~bfield; + size_t xcount = mi_debug_show_bfield(bfield, buf + k); + if (invert) xcount = MI_BFIELD_BITS - xcount; + bit_set_count += xcount; k += MI_BFIELD_BITS; buf[k++] = ' '; } @@ -946,9 +943,9 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ bit_count += MI_BFIELD_BITS; } - _mi_verbose_message("%s %s\n", prefix, buf); + _mi_output_message("%s %s\n", prefix, buf); } - _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); + _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count); return bit_set_count; } @@ -963,19 +960,19 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; block_total += arena->block_count; - _mi_verbose_message("arena %zu: %zu blocks (%zu MiB)%s\n", i, arena->block_count, mi_size_of_blocks(arena->block_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_output_message("arena %zu: %zu blocks (%zu MiB)%s\n", i, arena->block_count, mi_size_of_blocks(arena->block_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); + free_total += mi_debug_show_bitmap(" ", "in-use blocks", arena->block_count, &arena->blocks_free, true); } - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed); + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed, false); // todo: abandoned blocks if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge); + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge, false); } } - if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total); + if (show_inuse) _mi_output_message("total inuse blocks : %zu\n", block_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); + if (show_purge) _mi_output_message("total purgeable blocks: %zu\n", purge_total); } diff --git a/src/options.c b/src/options.c index 2eaf29a3..8cb0d216 100644 --- a/src/options.c +++ b/src/options.c @@ -438,6 +438,13 @@ static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix } } +void _mi_output_message(const char* fmt, ...) { + va_list args; + va_start(args, fmt); + mi_vfprintf(NULL, NULL, NULL, fmt, args); + va_end(args); +} + void _mi_trace_message(const char* fmt, ...) { if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher va_list args; diff --git a/src/page-map.c b/src/page-map.c index e803a367..f52fab10 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -11,7 +11,7 @@ terms of the MIT license. 
A copy of the license can be found in the file mi_decl_cache_align signed char* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; -static size_t mi_page_map_size_per_commit_bit = MI_ARENA_BLOCK_SIZE; +static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_BLOCK_SIZE; static mi_memid_t mi_page_map_memid; static mi_bitmap_t mi_page_map_commit; @@ -22,7 +22,7 @@ static bool mi_page_map_init(void) { // 64 KiB for 4 GiB address space (on 32-bit) const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_BLOCK_SHIFT)); - mi_page_map_size_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); @@ -42,16 +42,16 @@ static bool mi_page_map_init(void) { return true; } -static void mi_page_map_ensure_committed(void* p, size_t idx, size_t block_count) { +static void mi_page_map_ensure_committed(size_t idx, size_t block_count) { // is the page map area that contains the page address committed? if (!mi_page_map_all_committed) { - const size_t commit_bit_count = _mi_divide_up(block_count, mi_page_map_size_per_commit_bit); - const size_t commit_bit_idx = idx / mi_page_map_size_per_commit_bit; - for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { + const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; + const size_t commit_bit_idx_hi = (idx + block_count - 1) / mi_page_map_entries_per_commit_bit; + for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, i, 1)) { // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit(_mi_page_map + ((commit_bit_idx + i)*mi_page_map_size_per_commit_bit), mi_page_map_size_per_commit_bit, NULL, NULL); - mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); + _mi_os_commit(_mi_page_map + (i*mi_page_map_entries_per_commit_bit), mi_page_map_entries_per_commit_bit, NULL, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, i, 1, NULL); } } } @@ -71,11 +71,12 @@ void _mi_page_map_register(mi_page_t* page) { if mi_unlikely(_mi_page_map == NULL) { if (!mi_page_map_init()) return; } + mi_assert(_mi_page_map!=NULL); uint8_t* page_start; size_t block_count; const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); - mi_page_map_ensure_committed(page_start, idx, block_count); + mi_page_map_ensure_committed(idx, block_count); // set the offsets for (int i = 0; i < (int)block_count; i++) { @@ -100,7 +101,7 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_size_per_commit_bit, 1)) { + if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { diff --git a/src/page-queue.c b/src/page-queue.c index c6b19985..3fcd700d 100644 --- a/src/page-queue.c +++ 
b/src/page-queue.c @@ -112,7 +112,7 @@ size_t _mi_bin_size(uint8_t bin) { } // Good size for allocation -size_t mi_good_size(size_t size) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept { if (size <= MI_LARGE_MAX_OBJ_SIZE) { return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } diff --git a/src/page.c b/src/page.c index b6af4fd0..f8ef641e 100644 --- a/src/page.c +++ b/src/page.c @@ -638,7 +638,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { // calculate the extend count const size_t bsize = mi_page_block_size(page); - size_t extend = page->reserved - page->capacity; + size_t extend = (size_t)page->reserved - page->capacity; mi_assert_internal(extend > 0); size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize); @@ -672,7 +672,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); mi_page_set_heap(page, heap); size_t page_size; - uint8_t* page_start = mi_page_area(page, &page_size); + uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start); mi_track_mem_noaccess(page_start,page_size); mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16)); mi_assert_internal(page->reserved > 0); diff --git a/src/stats.c b/src/stats.c index 14489937..9f7a3cf0 100644 --- a/src/stats.c +++ b/src/stats.c @@ -133,7 +133,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { // unit == 0: count as decimal // unit < 0 : count in binary static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) { - char buf[32]; buf[0] = 0; + char buf[32]; _mi_memzero_var(buf); int len = 32; const char* suffix = (unit <= 0 ? " " : "B"); const int64_t base = (unit == 0 ? 1000 : 1024); @@ -298,7 +298,7 @@ static void mi_cdecl mi_buffered_out(const char* msg, void* arg) { static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept { // wrap the output function to be line buffered - char buf[256]; + char buf[256]; _mi_memzero_var(buf); buffered_t buffer = { out0, arg0, NULL, 0, 255 }; buffer.buf = buf; mi_output_fun* out = &mi_buffered_out; diff --git a/test/test-stress.c b/test/test-stress.c index e287cfa7..6327e995 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -48,13 +48,13 @@ static int ITER = 20; static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads -#endif +#endif #define STRESS // undefine for leak test -static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) +static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? 
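Since `mi_good_size` is marked `mi_decl_export` above, a short usage note: a caller that manages its own buffer sizes can round a request up to the size mimalloc would hand out anyway, so the allocation carries no internal slack. A minimal example against the public API:

#include <mimalloc.h>

void* alloc_buffer(size_t needed, size_t* actual) {
  const size_t n = mi_good_size(needed);   /* round up to mimalloc's bin size */
  void* p = mi_malloc(n);
  if (p != NULL && actual != NULL) { *actual = n; }  /* the caller may use the full rounded size */
  return p;
}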
static bool main_participates = false; // main thread participates as a worker too @@ -244,7 +244,7 @@ static void test_stress(void) { //mi_debug_show_arenas(); #endif #if !defined(NDEBUG) || defined(MI_TSAN) - if (true) // (n + 1) % 10 == 0) + if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif } @@ -320,7 +320,7 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - mi_debug_show_arenas(true,true,true); + mi_debug_show_arenas(true,true,false); mi_collect(true); #endif // mi_stats_print(NULL); @@ -345,9 +345,10 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { thread_entry_fun = fun; DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD)); HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE)); + thandles[0] = GetCurrentThread(); // avoid lint warning const size_t start = (main_participates ? 1 : 0); for (size_t i = start; i < nthreads; i++) { - thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]); + thandles[i] = CreateThread(0, 8*1024L, &thread_entry, (void*)(i), 0, &tids[i]); } if (main_participates) fun(0); // run the main thread as well for (size_t i = start; i < nthreads; i++) { From 309fc26b4b4d983b86f65fe4a56375c641aa2f09 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 12:00:30 -0800 Subject: [PATCH 011/264] wip: add generic find_and_xset --- ide/vs2022/mimalloc.vcxproj | 2 ++ src/bitmap.c | 62 +++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 9e8dab78..d03fd281 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -120,6 +120,7 @@ CompileAsCpp false stdcpp20 + AdvancedVectorExtensions2 @@ -180,6 +181,7 @@ CompileAsCpp true stdcpp20 + AdvancedVectorExtensions2 true diff --git a/src/bitmap.c b/src/bitmap.c index 1a1bb031..bb54af6b 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -37,6 +37,13 @@ static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { return mi_rotr(x,r); } +// Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). +// return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_least_to_xset(mi_bit_t set, mi_bfield_t x, size_t* idx) { + return mi_bfield_find_least_bit((set ? ~x : x), idx); +} + // Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); @@ -190,7 +197,8 @@ static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, siz return all_xset; } -// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0), +// Try to atomically set/clear a sequence of `n` bits within a chunk. +// Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); @@ -251,6 +259,54 @@ restore: } +// find least 0/1-bit in a chunk and try to set/clear it atomically +// set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. 
+// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { +#if 0 && defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while (true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 + const size_t chunk_idx = _tzcnt_u32(mask) / 8; + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear + if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } +#else + for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + size_t idx; + if mi_unlikely(mi_bfield_find_least_to_xset(set, chunk->bfields[i], &idx)) { // find least 0-bit + if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx)) { // try to set it atomically + *pidx = (i*MI_BFIELD_BITS + idx); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + } + return false; +#endif +} + +static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { + return mi_bitmap_chunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); +} + +static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { + return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); +} + +/* // find least 1-bit in a chunk and try unset it atomically // set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. // todo: try neon version @@ -288,7 +344,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, return false; #endif } - +*/ // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. @@ -613,7 +669,7 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries if (n == 0 || n > MI_BFIELD_BITS) return false; From d15e83030ea5f613f75bb3c1c1b380fe1e847467 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 12:16:41 -0800 Subject: [PATCH 012/264] wip: rename arena blocks to slices --- include/mimalloc/internal.h | 18 +- include/mimalloc/types.h | 30 ++-- src/arena-abandon.c | 2 +- src/arena-old.c | 22 +-- src/arena.c | 330 ++++++++++++++++++------------------ src/page-map.c | 38 ++--- src/stats.c | 4 +- test/test-stress.c | 2 +- 8 files changed, 222 insertions(+), 224 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index d4ec8bb7..082882bb 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -443,16 +443,16 @@ extern signed char* _mi_page_map; #define MI_PAGE_PTR_INVALID ((mi_page_t*)(1)) static inline mi_page_t* _mi_ptr_page(const void* p) { - const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_BLOCK_SHIFT; + const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; const ptrdiff_t ofs = _mi_page_map[up]; #if MI_DEBUG if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; #endif - return (mi_page_t*)((up + ofs + 1) << MI_ARENA_BLOCK_SHIFT); + return (mi_page_t*)((up + ofs + 1) << MI_ARENA_SLICE_SHIFT); } -// Get the block size of a page +// Get the block size of a page static inline size_t mi_page_block_size(const mi_page_t* page) { mi_assert_internal(page->block_size > 0); return page->block_size; @@ -509,8 +509,8 @@ static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); mi_atomic_store_release(&page->xheap,(uintptr_t)heap); - if (heap != NULL) { - page->heap_tag = heap->tag; + if (heap != NULL) { + page->heap_tag = heap->tag; mi_atomic_store_release(&page->xthread_id, heap->thread_id); } else { @@ -749,13 +749,13 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c ----------------------------------------------------------- */ // Blocks needed for a given byte size -static inline size_t mi_block_count_of_size(size_t size) { - return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); +static inline size_t mi_slice_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_SLICE_SIZE); } // Byte size of a number of blocks -static inline size_t mi_size_of_blocks(size_t bcount) { - return (bcount * MI_ARENA_BLOCK_SIZE); +static inline size_t mi_size_of_slices(size_t bcount) { + return (bcount * MI_ARENA_SLICE_SIZE); } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index e3c0786c..ac0a5fc4 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -112,26 +112,26 @@ terms of the MIT license. 
A copy of the license can be found in the file // ------------------------------------------------------ // Sizes are for 64-bit -#ifndef MI_ARENA_BLOCK_SHIFT +#ifndef MI_ARENA_SLICE_SHIFT #ifdef MI_SMALL_PAGE_SHIFT // compatibility -#define MI_ARENA_BLOCK_SHIFT MI_SMALL_PAGE_SHIFT +#define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT #else -#define MI_ARENA_BLOCK_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) +#define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) #endif #endif #ifndef MI_BITMAP_CHUNK_BITS_SHIFT #define MI_BITMAP_CHUNK_BITS_SHIFT 8 // optimized for 256 bits per chunk (avx2) #endif -#define MI_ARENA_BLOCK_SIZE (MI_ZU(1) << MI_ARENA_BLOCK_SHIFT) -#define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) +#define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) +#define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) #define MI_BITMAP_CHUNK_BITS (MI_ZU(1) << MI_BITMAP_CHUNK_BITS_SHIFT) -#define MI_ARENA_MIN_OBJ_BLOCKS (1) +#define MI_ARENA_MIN_OBJ_BLOCKS (1) #define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) -#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE) +#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE) #define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) @@ -145,7 +145,7 @@ terms of the MIT license. A copy of the license can be found in the file // Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in singleton pages -#define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_BLOCK_ALIGN) +#define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_SLICE_ALIGN) // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX @@ -162,8 +162,8 @@ typedef enum mi_memkind_e { MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) MI_MEM_OS, // allocated from the OS MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) - MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) - MI_MEM_ARENA // allocated from an arena (the usual case) + MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. 
using `mremap`) + MI_MEM_ARENA // allocated from an arena (the usual case) } mi_memkind_t; static inline bool mi_memkind_is_os(mi_memkind_t memkind) { @@ -176,8 +176,8 @@ typedef struct mi_memid_os_info { } mi_memid_os_info_t; typedef struct mi_memid_arena_info { - uint32_t block_index; // base index in the arena - uint32_t block_count; // allocated blocks + uint32_t slice_index; // base index in the arena + uint32_t slice_count; // allocated slices mi_arena_id_t id; // arena id (>= 1) bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; @@ -295,7 +295,7 @@ typedef struct mi_page_s { uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding - size_t block_size; // size available in each block (always `>0`) + size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks #if (MI_ENCODE_FREELIST || MI_PADDING) @@ -340,7 +340,7 @@ typedef enum mi_page_kind_e { MI_PAGE_SMALL, // small blocks go into 64KiB pages MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages MI_PAGE_LARGE, // larger blocks go into 4MiB pages - MI_PAGE_SINGLETON // page containing a single block. + MI_PAGE_SINGLETON // page containing a single block. // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. } mi_page_kind_t; diff --git a/src/arena-abandon.c b/src/arena-abandon.c index 48e37794..14712886 100644 --- a/src/arena-abandon.c +++ b/src/arena-abandon.c @@ -344,7 +344,7 @@ bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool vi _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); return false; } - mi_arena_field_cursor_t current; + mi_arena_field_cursor_t current;0 _mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, ¤t); mi_segment_t* segment; bool ok = true; diff --git a/src/arena-old.c b/src/arena-old.c index 8ca5aaf3..3f41e9c7 100644 --- a/src/arena-old.c +++ b/src/arena-old.c @@ -34,7 +34,7 @@ typedef struct mi_arena_s { mi_arena_id_t id; // arena id; 0 for non-specific mi_memid_t memid; // memid of the memory area _Atomic(uint8_t*)start; // the start of the memory area - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_SLICE_SIZE`) size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) size_t meta_size; // size of the arena structure itself (including its bitmaps) mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) @@ -53,8 +53,8 @@ typedef struct mi_arena_s { } mi_arena_t; -#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB +#define MI_ARENA_SLICE_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_SLICE_SIZE/2) // 32MiB #define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) // The available arenas @@ -113,11 +113,11 @@ mi_arena_t* mi_arena_from_index(size_t idx) { ----------------------------------------------------------- */ static size_t mi_block_count_of_size(size_t size) { - 
return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); + return _mi_divide_up(size, MI_ARENA_SLICE_SIZE); } static size_t mi_arena_block_size(size_t bcount) { - return (bcount * MI_ARENA_BLOCK_SIZE); + return (bcount * MI_ARENA_SLICE_SIZE); } static size_t mi_arena_size(mi_arena_t* arena) { @@ -363,7 +363,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (!_mi_os_has_virtual_reserve()) { arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); if (arena_count >= 8 && arena_count <= 128) { // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) @@ -429,7 +429,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) { - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); + return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } @@ -774,13 +774,13 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - if (size < MI_ARENA_BLOCK_SIZE) return false; + if (size < MI_ARENA_SLICE_SIZE) return false; if (is_large) { mi_assert_internal(memid.initially_committed && memid.is_pinned); } - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; + const size_t bcount = size / MI_ARENA_SLICE_SIZE; const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); const size_t bitmaps = (memid.is_pinned ? 3 : 5); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); @@ -836,7 +836,7 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is // Reserve a range of regular OS memory int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one block mi_memid_t memid; void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); if (start == NULL) return ENOMEM; @@ -898,7 +898,7 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; - _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_SLICE_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? 
", pinned" : "")); if (show_inuse) { inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); } diff --git a/src/arena.c b/src/arena.c index 424a9c70..7b5256b6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -36,19 +36,19 @@ typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area mi_arena_id_t id; // arena id; 0 for non-specific - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t slice_count; // size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) int numa_node; // associated NUMA node bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited - _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. mi_subproc_t* subproc; - mi_bitmap_t blocks_free; // is the block free? - mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) - mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free) - mi_bitmap_t blocks_dirty; // is the block potentially non-zero? - mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + mi_bitmap_t slices_free; // is the slice free? + mi_bitmap_t slices_committed; // is the slice committed? (i.e. accessible) + mi_bitmap_t slices_purge; // can the slice be purged? (slice in purge => slice in free) + mi_bitmap_t slices_dirty; // is the slice potentially non-zero? 
+ mi_bitmap_t slices_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages } mi_arena_t; @@ -112,14 +112,14 @@ mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { // Size of an arena static size_t mi_arena_size(mi_arena_t* arena) { - return mi_size_of_blocks(arena->block_count); + return mi_size_of_slices(arena->slice_count); } -static size_t mi_arena_info_blocks(void) { +static size_t mi_arena_info_slices(void) { const size_t os_page_size = _mi_os_page_size(); const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page - const size_t info_blocks = mi_block_count_of_size(info_size); - return info_blocks; + const size_t info_slices = mi_slice_count_of_size(info_size); + return info_slices; } @@ -128,9 +128,9 @@ static uint8_t* mi_arena_start(mi_arena_t* arena) { return ((uint8_t*)arena); } -// Start of a block -uint8_t* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { - return (mi_arena_start(arena) + mi_size_of_blocks(block_index)); +// Start of a slice +uint8_t* mi_arena_slice_start(mi_arena_t* arena, size_t slice_index) { + return (mi_arena_start(arena) + mi_size_of_slices(slice_index)); } // Arena area @@ -140,43 +140,43 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { if (arena_index >= MI_MAX_ARENAS) return NULL; mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + if (size != NULL) { *size = mi_size_of_slices(arena->slice_count); } return mi_arena_start(arena); } // Create an arena memid -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index, size_t block_count) { - mi_assert_internal(block_index < UINT32_MAX); - mi_assert_internal(block_count < UINT32_MAX); +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t slice_index, size_t slice_count) { + mi_assert_internal(slice_index < UINT32_MAX); + mi_assert_internal(slice_count < UINT32_MAX); mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); memid.mem.arena.id = id; - memid.mem.arena.block_index = (uint32_t)block_index; - memid.mem.arena.block_count = (uint32_t)block_count; + memid.mem.arena.slice_index = (uint32_t)slice_index; + memid.mem.arena.slice_count = (uint32_t)slice_count; memid.mem.arena.is_exclusive = is_exclusive; return memid; } // returns if the arena is exclusive -static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index, size_t* block_count) { +static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* slice_index, size_t* slice_count) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); *arena_index = mi_arena_id_index(memid.mem.arena.id); - if (block_index) *block_index = memid.mem.arena.block_index; - if (block_count) *block_count = memid.mem.arena.block_count; + if (slice_index) *slice_index = memid.mem.arena.slice_index; + if (slice_count) *slice_count = memid.mem.arena.slice_count; return memid.mem.arena.is_exclusive; } -// get the arena and block index -static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* block_index, size_t* block_count) { +// get the arena and slice index +static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* slice_index, size_t* slice_count) { size_t arena_index; - mi_arena_memid_indices(memid, &arena_index, block_index, block_count); + 
mi_arena_memid_indices(memid, &arena_index, slice_index, slice_count); return mi_arena_from_index(arena_index); } -static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index, size_t* block_count) { +static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* slice_index, size_t* slice_count) { // todo: maybe store the arena* directly in the page? - return mi_arena_from_memid(page->memid, block_index, block_count); + return mi_arena_from_memid(page->memid, slice_index, slice_count); } @@ -185,19 +185,19 @@ static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index, size_t* b ----------------------------------------------------------- */ static mi_decl_noinline void* mi_arena_try_alloc_at( - mi_arena_t* arena, size_t needed_bcount, bool commit, size_t tseq, mi_memid_t* memid) -{ - size_t block_index; - if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, needed_bcount, tseq, &block_index)) return NULL; + mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) +{ + size_t slice_index; + if (!mi_bitmap_try_find_and_clearN(&arena->slices_free, slice_count, tseq, &slice_index)) return NULL; // claimed it! - void* p = mi_arena_block_start(arena, block_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index, needed_bcount); + void* p = mi_arena_slice_start(arena, slice_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; // set the dirty bits if (arena->memid.initially_zero) { - memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL); + memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count, NULL); } // set commit state @@ -206,10 +206,10 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = true; bool all_already_committed; - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); + mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count, &all_already_committed); if (!all_already_committed) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, NULL)) { + if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { memid->initially_committed = false; } else { @@ -219,13 +219,13 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } else { // no need to commit, but check if already fully committed - memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); + memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, needed_bcount)); - if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount)); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount)); - // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, needed_bcount)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); + if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); } + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, 
slice_count)); + // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); return p; } @@ -247,7 +247,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (!_mi_os_has_virtual_reserve()) { arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 8 && arena_count <= 128) { // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) @@ -259,8 +259,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1); - const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE; + const size_t min_reserve = mi_size_of_slices(mi_arena_info_slices() + 1); + const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_SLICE_SIZE; if (arena_reserve < min_reserve) { arena_reserve = min_reserve; } @@ -319,55 +319,55 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are if (var_arena != NULL && mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ { -#define mi_forall_arenas_end() }}} +#define mi_forall_arenas_end() }}} /* ----------------------------------------------------------- Arena allocation ----------------------------------------------------------- */ -// allocate blocks from the arenas +// allocate slices from the arenas static mi_decl_noinline void* mi_arena_try_find_free( - size_t block_count, size_t alignment, + size_t slice_count, size_t alignment, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - mi_assert_internal(block_count <= mi_block_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; + mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); + mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); + if (alignment > MI_ARENA_SLICE_ALIGN) return NULL; // search arena's mi_subproc_t* const subproc = tld->subproc; const size_t tseq = tld->tseq; mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) { - void* p = mi_arena_try_alloc_at(arena, block_count, commit, tseq, memid); + void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); if (p != NULL) return p; } mi_forall_arenas_end(); return NULL; } -// Allocate blocks from the arena's -- potentially allocating a fresh arena +// Allocate slices from the arena's -- potentially allocating a fresh arena static mi_decl_noinline void* mi_arena_try_alloc( - size_t block_count, size_t alignment, + size_t slice_count, size_t alignment, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - mi_assert(block_count <= MI_ARENA_MAX_OBJ_BLOCKS); - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + mi_assert(slice_count <= MI_ARENA_MAX_OBJ_BLOCKS); + mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); - // try to find free blocks in the arena's - void* p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + // try to find free slices in the arena's + void* p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; // 
otherwise, try to first eagerly reserve a new arena if (req_arena_id == _mi_arena_id_none()) { mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(mi_size_of_blocks(block_count), allow_large, req_arena_id, &arena_id)) { + if (mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id)) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; } } @@ -412,10 +412,10 @@ void* _mi_arena_alloc_aligned( if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? req_arena_id == _mi_arena_id_none() && // not a specific arena? size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large - alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) // and good alignment + alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { - const size_t block_count = mi_block_count_of_size(size); - void* p = mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + const size_t slice_count = mi_slice_count_of_size(size); + void* p = mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; } @@ -426,7 +426,7 @@ void* _mi_arena_alloc_aligned( void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); + return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } @@ -435,10 +435,10 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { - MI_UNUSED(block_count); - const size_t bin = _mi_bin(block_size); + MI_UNUSED(slice_count); + const size_t bin = _mi_bin(block_size); mi_assert_internal(bin < MI_BIN_COUNT); // any abandoned in our size class? 
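With the block-to-slice rename above, object sizes map onto arena slices by simple division. A standalone sketch of that arithmetic, using the 64-bit defaults from the types.h hunk (64 KiB slices) and with the two helpers mirrored locally so the numbers can be checked in isolation:

#include <stddef.h>
#include <assert.h>

#define ARENA_SLICE_SIZE  (64*1024)   /* MI_ARENA_SLICE_SIZE on 64-bit: 1 << (13+3) */

static size_t slice_count_of_size(size_t size) {   /* mirrors mi_slice_count_of_size */
  return (size + ARENA_SLICE_SIZE - 1) / ARENA_SLICE_SIZE;
}
static size_t size_of_slices(size_t count) {       /* mirrors mi_size_of_slices */
  return count * ARENA_SLICE_SIZE;
}

int main(void) {
  assert(slice_count_of_size( 64*1024) == 1);   /* a small page spans one slice */
  assert(slice_count_of_size(512*1024) == 8);   /* a medium page (8x small) spans 8 slices */
  assert(size_of_slices(8) == 512*1024);
  return 0;
}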
@@ -450,15 +450,15 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t bl size_t tseq = tld->tseq; mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) { - size_t block_index; - if (mi_bitmap_try_find_and_clear(&arena->blocks_abandoned[bin], tseq, &block_index)) { + size_t slice_index; + if (mi_bitmap_try_find_and_clear(&arena->slices_abandoned[bin], tseq, &slice_index)) { // found an abandoned page of the right size mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - mi_page_t* page = (mi_page_t*)mi_arena_block_start(arena, block_index); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, block_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, block_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, block_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, block_count)); + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(!mi_page_is_full(page)); mi_assert_internal(mi_page_is_abandoned(page)); @@ -469,7 +469,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t bl return NULL; } -static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { const bool allow_large = true; const bool commit = true; @@ -479,20 +479,20 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) { - page = (mi_page_t*)mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, &memid, tld); } // otherwise fall back to the OS if (page == NULL) { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_blocks(block_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); } if (page == NULL) return NULL; - // claimed free blocks: initialize the page partly + // claimed free slices: initialize the page partly _mi_memzero_aligned(page, sizeof(*page)); mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN)); - const size_t reserved = (mi_size_of_blocks(block_count) - MI_PAGE_INFO_SIZE) / block_size; + const size_t reserved = (mi_size_of_slices(slice_count) - MI_PAGE_INFO_SIZE) / block_size; mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + 
MI_PAGE_INFO_SIZE; @@ -512,22 +512,20 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz return page; } -// block_count: arena block count for the page -// block size : page block size -static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size_t block_size) { +static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size_t block_size) { const mi_arena_id_t req_arena_id = heap->arena_id; mi_tld_t* const tld = heap->tld; - // 1. look for an abandoned page - mi_page_t* page = mi_arena_page_try_find_abandoned(block_count, block_size, req_arena_id, tld); + // 1. look for an abandoned page + mi_page_t* page = mi_arena_page_try_find_abandoned(slice_count, block_size, req_arena_id, tld); if (page != NULL) { return page; // return as abandoned } // 2. find a free block, potentially allocating a new arena - page = mi_arena_page_alloc_fresh(block_count, block_size, req_arena_id, tld); - if (page != NULL) { - mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.block_count == block_count); + page = mi_arena_page_alloc_fresh(slice_count, block_size, req_arena_id, tld); + if (page != NULL) { + mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); return page; } @@ -550,17 +548,17 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ page = mi_singleton_page_alloc(heap, block_size, page_alignment); } else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); } else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } else { page = mi_singleton_page_alloc(heap, block_size, page_alignment); - } + } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; } @@ -579,16 +577,16 @@ void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(page->next==NULL); - + if (mi_page_all_free(page)) { _mi_arena_page_free(page, tld); } else if (page->memid.memkind==MI_MEM_ARENA) { // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); - size_t block_index; - mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); - bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_abandoned[bin], block_index, 1, NULL); + size_t slice_index; + mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); + bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_abandoned[bin], slice_index, 1, NULL); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); } @@ -605,8 +603,8 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's if 
mi_likely(memid.memkind == MI_MEM_ARENA) { - size_t block_index; - mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); + size_t slice_index; + mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess // don't reclaim more from a `free` call than half the current segments @@ -616,7 +614,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { // return false; // } const size_t bin = _mi_bin(page->block_size); - if (mi_bitmap_try_xsetN(MI_BIT_CLEAR, &arena->blocks_abandoned[bin], block_index, 1)) { + if (mi_bitmap_try_xsetN(MI_BIT_CLEAR, &arena->slices_abandoned[bin], slice_index, 1)) { // we got it atomically _mi_page_reclaim(heap, page); mi_assert_internal(!mi_page_is_abandoned(page)); @@ -650,7 +648,7 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { /* ----------------------------------------------------------- Arena free ----------------------------------------------------------- */ -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats); +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats); static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { @@ -673,20 +671,20 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } else if (memid.memkind == MI_MEM_ARENA) { // allocated in an arena - size_t block_count; - size_t block_idx; - mi_arena_t* arena = mi_arena_from_memid(memid, &block_idx, &block_count); + size_t slice_count; + size_t slice_index; + mi_arena_t* arena = mi_arena_from_memid(memid, &slice_index, &slice_count); mi_assert_internal(size==1); - mi_assert_internal(mi_arena_block_start(arena,block_idx) <= p); - mi_assert_internal(mi_arena_block_start(arena,block_idx) + mi_size_of_blocks(block_count) > p); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= p); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > p); // checks if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } - mi_assert_internal(block_idx < arena->block_count); - mi_assert_internal(block_idx >= mi_arena_info_blocks()); - if (block_idx < mi_arena_info_blocks() || block_idx > arena->block_count) { + mi_assert_internal(slice_index < arena->slice_count); + mi_assert_internal(slice_index >= mi_arena_info_slices()); + if (slice_index < mi_arena_info_slices() || slice_index > arena->slice_count) { _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } @@ -698,7 +696,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi else { if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, block_idx, block_count, NULL); + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slice_index, slice_count, NULL); mi_track_mem_noaccess(p, size); if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) @@ -710,13 +708,13 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // works (as 
we should never reset decommitted parts). } // (delay) purge the entire range - mi_arena_schedule_purge(arena, block_idx, block_count, stats); + mi_arena_schedule_purge(arena, slice_index, slice_count, stats); } // and make it available to others again - bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, block_count, NULL); + bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_free, slice_index, slice_count, NULL); if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_block_start(arena,block_idx), mi_size_of_blocks(block_count)); + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; }; } @@ -767,7 +765,7 @@ bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { return true; } } @@ -781,7 +779,7 @@ bool _mi_arena_contains(const void* p) { static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { mi_assert_internal(arena != NULL); - mi_assert_internal(arena->block_count > 0); + mi_assert_internal(arena->slice_count > 0); if (arena_id != NULL) { *arena_id = -1; } size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); @@ -799,26 +797,26 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_assert(!is_large || memid.initially_committed && memid.is_pinned); - mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)); + mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); mi_assert(start!=NULL); if (start==NULL) return false; - if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) { - // todo: use alignment in memid to align to blocksize first? - _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start); + if (!_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)) { + // todo: use alignment in memid to align to slice size first? 
+ _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_SLICE_SIZE/MI_KiB, start); return false; } if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } - const size_t info_blocks = mi_arena_info_blocks(); - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down - if (bcount < info_blocks+1) { - _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB); + const size_t info_slices = mi_arena_info_slices(); + const size_t bcount = size / MI_ARENA_SLICE_SIZE; // divide down + if (bcount < info_slices+1) { + _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_slices(info_slices+1)/MI_KiB); return false; } if (bcount > MI_BITMAP_MAX_BITS) { // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) - _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB); + _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BITS)/MI_MiB); return false; } mi_arena_t* arena = (mi_arena_t*)start; @@ -826,17 +824,17 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // commit & zero if needed bool is_zero = memid.initially_zero; if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main); + _mi_os_commit(arena, mi_size_of_slices(info_slices), &is_zero, &_mi_stats_main); } if (!is_zero) { - _mi_memzero(arena, mi_size_of_blocks(info_blocks)); + _mi_memzero(arena, mi_size_of_slices(info_slices)); } // init arena->id = _mi_arena_id_none(); arena->memid = memid; arena->exclusive = exclusive; - arena->block_count = bcount; + arena->slice_count = bcount; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? 
(now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; @@ -845,25 +843,25 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int if (heap != NULL) { arena->subproc = heap->tld->subproc; } - + // init bitmaps - mi_bitmap_init(&arena->blocks_free,true); - mi_bitmap_init(&arena->blocks_committed,true); - mi_bitmap_init(&arena->blocks_dirty,true); - mi_bitmap_init(&arena->blocks_purge,true); + mi_bitmap_init(&arena->slices_free,true); + mi_bitmap_init(&arena->slices_committed,true); + mi_bitmap_init(&arena->slices_dirty,true); + mi_bitmap_init(&arena->slices_purge,true); for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { - mi_bitmap_init(&arena->blocks_abandoned[i],true); + mi_bitmap_init(&arena->slices_abandoned[i],true); } - // reserve our meta info (and reserve blocks outside the memory area) - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks); + // reserve our meta info (and reserve slices outside the memory area) + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count); + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_committed, 0, arena->slice_count); } else { - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, 0, info_slices, NULL); } - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, 0, info_slices, NULL); return mi_arena_add(arena, arena_id, &_mi_stats_main); } @@ -880,9 +878,9 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is // Reserve a range of regular OS memory int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one slice mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_ARENA_BLOCK_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid, &_mi_stats_main); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
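From the user's point of view this path is reached through the public `mi_reserve_os_memory_ex` API shown above: the size is rounded up to whole 64 KiB slices, slice-aligned memory is obtained from the OS, and `mi_manage_os_memory_ex2` lays the arena header and its bitmaps out in the first `info_slices` of that region. A usage sketch, assuming the usual 0-on-success convention of this API (the 256 MiB size and the heap binding in the last line are only an example):

  #include <mimalloc.h>

  // reserve a dedicated, exclusive arena at startup and bind a heap to it
  static mi_heap_t* make_arena_heap(void) {
    mi_arena_id_t arena_id;
    if (mi_reserve_os_memory_ex(256UL * 1024 * 1024,  // rounded up to whole slices internally
                                true  /* commit */,
                                true  /* allow large OS pages */,
                                true  /* exclusive */,
                                &arena_id) != 0) {
      return NULL;                                    // reservation failed (e.g. ENOMEM)
    }
    return mi_heap_new_in_arena(arena_id);            // allocations from this heap stay in the arena
  }

Since the arena is exclusive, it only serves requests that pass its `arena_id`, which is what makes the heap binding useful for isolating a subsystem's allocations.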
if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { @@ -919,15 +917,15 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap, bool invert) { +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) { _mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; - for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { + for (int i = 0; i < MI_BFIELD_BITS && bit_count < slice_count; i++) { char buf[MI_BITMAP_CHUNK_BITS + 32]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (int j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { - if (bit_count < block_count) { + if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; size_t xcount = mi_debug_show_bfield(bfield, buf + k); @@ -939,10 +937,10 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ else { _mi_memset(buf + k, ' ', MI_BFIELD_BITS); k += MI_BFIELD_BITS; - } - bit_count += MI_BFIELD_BITS; + } + bit_count += MI_BFIELD_BITS; } - + _mi_output_message("%s %s\n", prefix, buf); } _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count); @@ -953,26 +951,26 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) MI_UNUSED(show_abandoned); size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); size_t free_total = 0; - size_t block_total = 0; + size_t slice_total = 0; //size_t abandoned_total = 0; size_t purge_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; - block_total += arena->block_count; - _mi_output_message("arena %zu: %zu blocks (%zu MiB)%s\n", i, arena->block_count, mi_size_of_blocks(arena->block_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); + slice_total += arena->slice_count; + _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? 
", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "in-use blocks", arena->block_count, &arena->blocks_free, true); + free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, &arena->slices_free, true); } - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed, false); - // todo: abandoned blocks + mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, &arena->slices_committed, false); + // todo: abandoned slices if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge, false); + purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, &arena->slices_purge, false); } } - if (show_inuse) _mi_output_message("total inuse blocks : %zu\n", block_total - free_total); - // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_output_message("total purgeable blocks: %zu\n", purge_total); + if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); + // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); + if (show_purge) _mi_output_message("total purgeable slices: %zu\n", purge_total); } @@ -1066,18 +1064,18 @@ static long mi_arena_purge_delay(void) { } // reset or decommit in an arena and update the committed/decommit bitmaps -// assumes we own the area (i.e. blocks_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { +// assumes we own the area (i.e. slices_free is claimed by us) +static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats) { mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_blocks(blocks); - void* const p = mi_arena_block_start(arena, block_idx); + const size_t size = mi_size_of_slices(slices); + void* const p = mi_arena_slice_start(arena, slice_index); bool needs_recommit; - if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) { - // all blocks are committed, we can purge freely + if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slices)) { + // all slices are committed, we can purge freely needs_recommit = _mi_os_purge(p, size, stats); } else { - // some blocks are not committed -- this can happen when a partially committed block is freed + // some slices are not committed -- this can happen when a partially committed slice is freed // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), // and also undo the decommit stats (as it was already adjusted) @@ -1086,25 +1084,25 @@ static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, m if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } } - // clear the purged blocks - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, blocks, block_idx, NULL); + // clear the purged slices + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slices, slice_index, NULL); // update committed bitmap if (needs_recommit) { - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slices, slice_index, NULL); } } // Schedule a purge. 
This is usually delayed to avoid repeated decommit/commit calls. // Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats) { const long delay = mi_arena_purge_delay(); if (delay < 0) return; // is purging allowed at all? if (_mi_preloading() || delay == 0) { // decommit directly - mi_arena_purge(arena, block_idx, blocks, stats); + mi_arena_purge(arena, slice_index, slices, stats); } else { // schedule decommit diff --git a/src/page-map.c b/src/page-map.c index f52fab10..c7d5e8b4 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file mi_decl_cache_align signed char* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; -static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_BLOCK_SIZE; +static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; static mi_bitmap_t mi_page_map_commit; @@ -20,9 +20,9 @@ static bool mi_page_map_init(void) { if (vbits >= 48) vbits = 47; // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) // 64 KiB for 4 GiB address space (on 32-bit) - const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_BLOCK_SHIFT)); - - mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); + + mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); @@ -42,11 +42,11 @@ static bool mi_page_map_init(void) { return true; } -static void mi_page_map_ensure_committed(size_t idx, size_t block_count) { +static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { // is the page map area that contains the page address committed? 
if (!mi_page_map_all_committed) { const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; - const size_t commit_bit_idx_hi = (idx + block_count - 1) / mi_page_map_entries_per_commit_bit; + const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, i, 1)) { // this may race, in which case we do multiple commits (which is ok) @@ -57,12 +57,12 @@ static void mi_page_map_ensure_committed(size_t idx, size_t block_count) { } } -static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* block_count) { +static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) { size_t page_size; *page_start = mi_page_area(page, &page_size); if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE; } // furthest interior pointer - *block_count = mi_block_count_of_size(page_size); - return ((uintptr_t)*page_start >> MI_ARENA_BLOCK_SHIFT); + *slice_count = mi_slice_count_of_size(page_size); + return ((uintptr_t)*page_start >> MI_ARENA_SLICE_SHIFT); } @@ -73,13 +73,13 @@ void _mi_page_map_register(mi_page_t* page) { } mi_assert(_mi_page_map!=NULL); uint8_t* page_start; - size_t block_count; - const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); - - mi_page_map_ensure_committed(idx, block_count); + size_t slice_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count); + + mi_page_map_ensure_committed(idx, slice_count); // set the offsets - for (int i = 0; i < (int)block_count; i++) { + for (int i = 0; i < (int)slice_count; i++) { mi_assert_internal(i < 128); _mi_page_map[idx + i] = (signed char)(-i-1); } @@ -88,19 +88,19 @@ void _mi_page_map_register(mi_page_t* page) { void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); - + // get index and count uint8_t* page_start; - size_t block_count; - const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); + size_t slice_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count); // unset the offsets - _mi_memzero(_mi_page_map + idx, block_count); + _mi_memzero(_mi_page_map + idx, slice_count); } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); + uintptr_t idx = ((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } diff --git a/src/stats.c b/src/stats.c index 9f7a3cf0..53b18da0 100644 --- a/src/stats.c +++ b/src/stats.c @@ -325,11 +325,11 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, ""); mi_stat_peak_print(&stats->reset, "reset", 1, out, arg ); mi_stat_peak_print(&stats->purged, "purged", 1, out, arg ); - mi_stat_print(&stats->page_committed, "touched", 1, out, arg); //mi_stat_print(&stats->segments, "segments", -1, out, arg); //mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); //mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); - mi_stat_print(&stats->pages, "pages", -1, out, arg); + mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, ""); + mi_stat_print_ex(&stats->pages, 
"pages", -1, out, arg, ""); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); diff --git a/test/test-stress.c b/test/test-stress.c index 6327e995..e2133f7d 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -320,8 +320,8 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - mi_debug_show_arenas(true,true,false); mi_collect(true); + mi_debug_show_arenas(true,true,false); #endif // mi_stats_print(NULL); #endif From f8d04dc2bc42efcae8f6012f2ccdef8c3056801c Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 12:41:11 -0800 Subject: [PATCH 013/264] compile with clang and gcc --- CMakeLists.txt | 2 -- include/mimalloc/bits.h | 4 +-- include/mimalloc/internal.h | 2 +- include/mimalloc/types.h | 2 +- src/alloc-aligned.c | 6 ++-- src/arena.c | 12 ++++---- src/bitmap.c | 56 ++++++++++++++++++------------------- src/bitmap.h | 2 +- src/heap.c | 6 ++-- src/init.c | 7 +++-- src/os.c | 2 +- src/page-map.c | 2 +- src/page.c | 10 +++---- src/prim/unix/prim.c | 2 +- src/static.c | 2 -- 15 files changed, 57 insertions(+), 60 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5cb05840..04b09252 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,8 +57,6 @@ set(mi_sources src/page.c src/page-map.c src/random.c - src/segment.c - src/segment-map.c src/stats.c src/prim/prim.c) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index d6695a00..79034c2f 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -291,7 +291,7 @@ static inline size_t mi_rotr(size_t x, size_t r) { // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to // avoid UB when `rshift==0`. See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); - return (x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1))); + return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1)))); #endif } @@ -310,7 +310,7 @@ static inline size_t mi_rotl(size_t x, size_t r) { // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to // avoid UB when `rshift==0`. See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); - return (x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1))) + return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1)))); #endif } diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 082882bb..1c1ec2bc 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -471,7 +471,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { static inline bool mi_page_contains_address(const mi_page_t* page, const void* p) { size_t psize; uint8_t* start = mi_page_area(page, &psize); - return (start <= p && p < start + psize); + return (start <= (uint8_t*)p && (uint8_t*)p < start + psize); } static inline bool mi_page_is_in_arena(const mi_page_t* page) { diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ac0a5fc4..cc8deeb6 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -125,7 +125,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) -#define MI_BITMAP_CHUNK_BITS (MI_ZU(1) << MI_BITMAP_CHUNK_BITS_SHIFT) +#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) #define MI_ARENA_MIN_OBJ_BLOCKS (1) #define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 43dc2d36..84f49ec6 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -59,9 +59,9 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t void* p; size_t oversize; if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { - // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page) - // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the - // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down) + // use OS allocation for very large alignment and allocate inside a huge page (not in an arena) + // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned at a point in the + // first (and single) page such that the page info is `MI_ARENA_SLICE_SIZE` bytes before it (and can be found in the _mi_page_map). if mi_unlikely(offset != 0) { // todo: cannot support offset alignment for very large alignments yet #if MI_DEBUG > 0 diff --git a/src/arena.c b/src/arena.c index 7b5256b6..b59f8ad3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -652,7 +652,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size >= 0 && stats != NULL); + mi_assert_internal(size > 0 && stats != NULL); mi_assert_internal(committed_size <= size); if (p==NULL) return; if (size==0) return; @@ -675,8 +675,8 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi size_t slice_index; mi_arena_t* arena = mi_arena_from_memid(memid, &slice_index, &slice_count); mi_assert_internal(size==1); - mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= p); - mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > p); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= (uint8_t*)p); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > (uint8_t*)p); // checks if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); @@ -796,7 +796,7 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_assert(!is_large || memid.initially_committed && memid.is_pinned); + mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); mi_assert(start!=NULL); if (start==NULL) return false; @@ -849,7 +849,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_bitmap_init(&arena->slices_committed,true); 
mi_bitmap_init(&arena->slices_dirty,true); mi_bitmap_init(&arena->slices_purge,true); - for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { + for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { mi_bitmap_init(&arena->slices_abandoned[i],true); } @@ -924,7 +924,7 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ for (int i = 0; i < MI_BFIELD_BITS && bit_count < slice_count; i++) { char buf[MI_BITMAP_CHUNK_BITS + 32]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; - for (int j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; diff --git a/src/bitmap.c b/src/bitmap.c index bb54af6b..fe44bb67 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -22,9 +22,9 @@ static inline size_t mi_bfield_ctz(mi_bfield_t x) { return mi_ctz(x); } -static inline size_t mi_bfield_clz(mi_bfield_t x) { - return mi_clz(x); -} +//static inline size_t mi_bfield_clz(mi_bfield_t x) { +// return mi_clz(x); +//} // find the least significant bit that is set (i.e. count trailing zero's) // return false if `x==0` (with `*idx` undefined) and true otherwise, @@ -124,11 +124,11 @@ static bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, } // Check if a bit is set/clear -static inline bool mi_bfield_atomic_is_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear @@ -302,9 +302,9 @@ static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, return mi_bitmap_chunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); } -static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { - return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); -} +// static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { +// return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); +// } /* // find least 1-bit in a chunk and try unset it atomically @@ -435,19 +435,19 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, // are all bits in a bitmap chunk set? -static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return _mm256_test_all_ones(vec); - #else - // written like this for vectorization - mi_bfield_t x = chunk->bfields[0]; - for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { - x = x & chunk->bfields[i]; - } - return (~x == 0); - #endif -} +// static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { +// #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) +// const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); +// return _mm256_test_all_ones(vec); +// #else +// // written like this for vectorization +// mi_bfield_t x = chunk->bfields[0]; +// for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { +// x = x & chunk->bfields[i]; +// } +// return (~x == 0); +// #endif +// } // are all bits in a bitmap chunk clear? 
static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { @@ -594,11 +594,11 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < MI_BFIELD_BITS); + mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); diff --git a/src/bitmap.h b/src/bitmap.h index fcadc213..1a180924 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -22,7 +22,7 @@ typedef size_t mi_bfield_t; #define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) #define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) #define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) -#define MI_BFIELD_LO_BIT8 ((~(mi_bfield_t(0)))/0xFF) // 0x01010101 .. +#define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. #define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. #define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) diff --git a/src/heap.c b/src/heap.c index 8ee66055..4da3b449 100644 --- a/src/heap.c +++ b/src/heap.c @@ -31,7 +31,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void size_t count = 0; #endif - for (size_t i = 0; i <= MI_BIN_FULL; i++) { + for (int i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_t* page = pq->first; while(page != NULL) { @@ -54,7 +54,7 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ MI_UNUSED(arg1); MI_UNUSED(arg2); MI_UNUSED(pq); - mi_assert_internal(mi_page_heap(page) == heap); + mi_assert_internal(mi_page_heap(page) == heap); mi_assert_expensive(_mi_page_is_valid(page)); return true; } @@ -419,7 +419,7 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { // so threads may do delayed frees in either heap for a while. 
// note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state // so after this only the new heap will get delayed frees - for (size_t i = 0; i <= MI_BIN_FULL; i++) { + for (int i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_queue_t* append = &from->pages[i]; size_t pcount = _mi_page_queue_append(heap, pq, append); diff --git a/src/init.c b/src/init.c index 40bc5c4a..1456cb4a 100644 --- a/src/init.c +++ b/src/init.c @@ -33,7 +33,7 @@ const mi_page_t _mi_page_empty = { MI_ATOMIC_VAR_INIT(0), // xheap MI_ATOMIC_VAR_INIT(0), // xthread_id NULL, NULL, // next, prev - { { NULL, 0}, false, false, false, MI_MEM_NONE } // memid + { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -396,7 +396,8 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = 0; + mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } @@ -433,7 +434,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { _mi_stats_done(&heap->tld->stats); // free if not the main thread - if (heap != &_mi_heap_main) { + if (heap != &_mi_heap_main) { mi_thread_data_free((mi_thread_data_t*)heap); } else { diff --git a/src/os.c b/src/os.c index da41d152..110d7ec6 100644 --- a/src/os.c +++ b/src/os.c @@ -573,7 +573,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { } #endif -// Allocate MI_SEGMENT_SIZE aligned huge pages +// Allocate MI_ARENA_SLICE_ALIGN aligned huge pages void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) { *memid = _mi_memid_none(); if (psize != NULL) *psize = 0; diff --git a/src/page-map.c b/src/page-map.c index c7d5e8b4..07433aa3 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -107,4 +107,4 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att else { return false; } -} \ No newline at end of file +} diff --git a/src/page.c b/src/page.c index f8ef641e..d91b9123 100644 --- a/src/page.c +++ b/src/page.c @@ -250,13 +250,13 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { // called from segments when reclaiming abandoned pages void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_page_set_heap(page, heap); - _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) + _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) _mi_page_free_collect(page, false); // ensure used count is up to date mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); - + // TODO: push on full queue immediately if it is full? mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_push(heap, pq, page); @@ -686,7 +686,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); } #endif - + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); @@ -928,8 +928,8 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. 
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. -// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for -// very large requested alignments in which case we use a huge segment. +// The `huge_alignment` is normally 0 but is set to a multiple of MI_SLICE_SIZE for +// very large requested alignments in which case we use a huge singleton page. void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept { mi_assert_internal(heap != NULL); diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 59421e52..5a4440c3 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -478,7 +478,7 @@ static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, co int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { bool is_large = true; *is_zero = true; - *addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); + *addr = unix_mmap(hint_addr, size, MI_ARENA_SLICE_ALIGN, PROT_READ | PROT_WRITE, true, true, &is_large); if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes unsigned long numa_mask = (1UL << numa_node); // TODO: does `mbind` work correctly for huge OS pages? should we diff --git a/src/static.c b/src/static.c index b34d5d42..0a8fa447 100644 --- a/src/static.c +++ b/src/static.c @@ -33,8 +33,6 @@ terms of the MIT license. A copy of the license can be found in the file #include "page.c" // includes page-queue.c #include "page-map.c" #include "random.c" -#include "segment.c" -#include "segment-map.c" #include "stats.c" #include "prim/prim.c" #if MI_OSX_ZONE From 55b70f1588e0df2778743673f000749fe45f7a00 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 14:00:07 -0800 Subject: [PATCH 014/264] wip --- CMakeLists.txt | 2 +- include/mimalloc/internal.h | 5 +++-- include/mimalloc/types.h | 16 +++++++++------- src/free.c | 7 ++++--- src/heap.c | 4 ++-- src/init.c | 10 +++++----- src/page-map.c | 6 +++--- 7 files changed, 27 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04b09252..2c04aea8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -360,7 +360,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") else() - list(APPEND mi_cflags -ftls-model=initial-exec) + list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2) endif() endif() if(MI_OVERRIDE) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 1c1ec2bc..39bc23eb 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -438,17 +438,18 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si } -extern signed char* _mi_page_map; +extern uint8_t* _mi_page_map; #define MI_PAGE_PTR_INVALID ((mi_page_t*)(1)) static inline mi_page_t* _mi_ptr_page(const void* p) { const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + // __builtin_prefetch((void*)(up << MI_ARENA_SLICE_SHIFT)); const ptrdiff_t ofs = _mi_page_map[up]; #if MI_DEBUG if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; #endif - return (mi_page_t*)((up + ofs + 1) << MI_ARENA_SLICE_SHIFT); + return (mi_page_t*)((up - ofs + 1) << MI_ARENA_SLICE_SHIFT); } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 
cc8deeb6..f82265fb 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -283,18 +283,21 @@ typedef struct mi_subproc_s mi_subproc_t; // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { + _Atomic(mi_threadid_t)xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) + + mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) + uint16_t used; // number of blocks in use (including blocks in `thread_free`) uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) uint16_t reserved; // number of blocks reserved in memory + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type + mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized uint8_t retire_expire:7; // expiration count for retired blocks - - mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding + + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks @@ -304,7 +307,6 @@ typedef struct mi_page_s { _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads _Atomic(uintptr_t) xheap; // heap this threads belong to. - _Atomic(mi_threadid_t)xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` diff --git a/src/free.c b/src/free.c index 224070fe..5dbea4a4 100644 --- a/src/free.c +++ b/src/free.c @@ -126,10 +126,11 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { + if (p==NULL) return; mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); - if mi_unlikely(page==NULL) return; + // if mi_unlikely(page==NULL) return; + - const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); if mi_likely(is_local) { // thread-local free? if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) @@ -257,7 +258,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block // huge pages are special as they occupy the entire segment // as these are large we reset the memory occupied by the page so it is available to other threads // (as the owning thread needs to actually free the memory later). 
- _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively + _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively } else { #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading diff --git a/src/heap.c b/src/heap.c index 4da3b449..746ba4d0 100644 --- a/src/heap.c +++ b/src/heap.c @@ -31,7 +31,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void size_t count = 0; #endif - for (int i = 0; i <= MI_BIN_FULL; i++) { + for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_t* page = pq->first; while(page != NULL) { @@ -419,7 +419,7 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { // so threads may do delayed frees in either heap for a while. // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state // so after this only the new heap will get delayed frees - for (int i = 0; i <= MI_BIN_FULL; i++) { + for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_queue_t* append = &from->pages[i]; size_t pcount = _mi_page_queue_append(heap, pq, append); diff --git a/src/init.c b/src/init.c index 1456cb4a..16130af7 100644 --- a/src/init.c +++ b/src/init.c @@ -14,16 +14,17 @@ terms of the MIT license. A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { + MI_ATOMIC_VAR_INIT(0), // xthread_id + NULL, // free + 0, // used 0, // capacity 0, // reserved capacity + 0, // block size shift + 0, // heap tag { 0 }, // flags false, // is_zero 0, // retire_expire - NULL, // free NULL, // local_free - 0, // used - 0, // block size shift - 0, // heap tag 0, // block_size NULL, // page_start #if (MI_PADDING || MI_ENCODE_FREELIST) @@ -31,7 +32,6 @@ const mi_page_t _mi_page_empty = { #endif MI_ATOMIC_VAR_INIT(0), // xthread_free MI_ATOMIC_VAR_INIT(0), // xheap - MI_ATOMIC_VAR_INIT(0), // xthread_id NULL, NULL, // next, prev { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid }; diff --git a/src/page-map.c b/src/page-map.c index 07433aa3..624f615c 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,7 +9,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" -mi_decl_cache_align signed char* _mi_page_map = NULL; +mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; @@ -25,7 +25,7 @@ static bool mi_page_map_init(void) { mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems - _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); + _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); return false; @@ -81,7 +81,7 @@ void _mi_page_map_register(mi_page_t* page) { // set the offsets for (int i = 0; i < (int)slice_count; i++) { mi_assert_internal(i < 128); - _mi_page_map[idx + i] = (signed char)(-i-1); + _mi_page_map[idx + i] = (i+1); } } From 9ebe941ce0cb4705e584c7c638b7345458c6e79c Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 20:21:32 -0800 Subject: [PATCH 015/264] first version that passes the make test --- include/mimalloc/internal.h | 36 +++++++++++++---- include/mimalloc/types.h | 20 +++++----- src/alloc-aligned.c | 40 +++++++++---------- src/arena.c | 78 +++++++++++++++++++++++++------------ src/bitmap.c | 2 +- src/free.c | 13 ++++--- src/page-map.c | 13 ++++--- src/page.c | 26 ++++++++++--- test/test-api.c | 14 +++---- test/test-stress.c | 4 +- 10 files changed, 155 insertions(+), 91 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 39bc23eb..02a62bec 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -440,16 +440,34 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si extern uint8_t* _mi_page_map; -#define MI_PAGE_PTR_INVALID ((mi_page_t*)(1)) +static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) { + #if 1 + const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + const size_t ofs = _mi_page_map[idx]; + if (valid != NULL) *valid = (ofs != 0); + return (mi_page_t*)((idx - ofs + 1) << MI_ARENA_SLICE_SHIFT); + #else + const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + const uintptr_t up = idx << MI_ARENA_SLICE_SHIFT; + __builtin_prefetch((void*)up); + const size_t ofs = _mi_page_map[idx]; + if (valid != NULL) *valid = (ofs != 0); + return (mi_page_t*)(up - ((ofs - 1) << MI_ARENA_SLICE_SHIFT)); + #endif +} + +static inline mi_page_t* _mi_checked_ptr_page(const void* p) { + bool valid; + mi_page_t* const page = _mi_ptr_page_ex(p,&valid); + return (valid ? 
page : NULL); +} static inline mi_page_t* _mi_ptr_page(const void* p) { - const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; - // __builtin_prefetch((void*)(up << MI_ARENA_SLICE_SHIFT)); - const ptrdiff_t ofs = _mi_page_map[up]; #if MI_DEBUG - if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; + return _mi_checked_ptr_page(p); + #else + return _mi_ptr_page_ex(p,NULL); #endif - return (mi_page_t*)((up - ofs + 1) << MI_ARENA_SLICE_SHIFT); } @@ -509,12 +527,13 @@ static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); - mi_atomic_store_release(&page->xheap,(uintptr_t)heap); if (heap != NULL) { + mi_atomic_store_release(&page->xheap, (uintptr_t)heap); page->heap_tag = heap->tag; mi_atomic_store_release(&page->xthread_id, heap->thread_id); } else { + mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc); mi_atomic_store_release(&page->xthread_id,0); } } @@ -578,11 +597,12 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { } static inline bool mi_page_is_abandoned(const mi_page_t* page) { + // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) return (mi_page_thread_id(page) == 0); } static inline bool mi_page_is_huge(const mi_page_t* page) { - return (page->block_size > MI_LARGE_MAX_OBJ_SIZE); + return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN)); } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f82265fb..271c7efb 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -123,15 +123,16 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_BITMAP_CHUNK_BITS_SHIFT 8 // optimized for 256 bits per chunk (avx2) #endif +#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) #define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) -#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) -#define MI_ARENA_MIN_OBJ_BLOCKS (1) -#define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries +#define MI_ARENA_MIN_OBJ_SLICES (1) +#define MI_ARENA_MAX_OBJ_SLICES (MI_SIZE_BITS) // for now, cannot cross bit field boundaries.. todo: make it at least MI_BITMAP_CHUNK_BITS ? (16 MiB) +// #define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE) -#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE) +#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE) #define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) @@ -144,9 +145,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_BIN_COUNT (MI_BIN_FULL+1) -// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in singleton pages -#define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_SLICE_ALIGN) - // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX @@ -318,8 +316,10 @@ typedef struct mi_page_s { // Object sizes // ------------------------------------------------------ -#define MI_PAGE_ALIGN (64) -#define MI_PAGE_INFO_SIZE (2*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) +#define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. +#define MI_PAGE_MIN_BLOCK_ALIGN (32) // minimal block alignment in a page +#define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation +#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) // The max object size are checked to not waste more than 12.5% internally over the page sizes. // (Except for large pages since huge objects are allocated in 4MiB chunks) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 84f49ec6..9673334a 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -16,12 +16,11 @@ terms of the MIT license. A copy of the license can be found in the file // ------------------------------------------------------ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { - // objects up to `MI_PAGE_ALIGN` are allocated aligned to their size + // objects up to `MI_PAGE_MIN_BLOCK_ALIGN` are always allocated aligned to their size mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); if (alignment > size) return false; - if (alignment <= MI_MAX_ALIGN_SIZE) return true; const size_t bsize = mi_good_size(size); - return (bsize <= MI_PAGE_ALIGN && (bsize & (alignment-1)) == 0); + return (bsize <= MI_PAGE_MIN_BLOCK_ALIGN && (bsize & (alignment-1)) == 0); } #if MI_GUARDED @@ -39,9 +38,9 @@ static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, si static void* mi_heap_malloc_zero_no_guarded(mi_heap_t* heap, size_t size, bool zero) { const size_t rate = heap->guarded_sample_rate; - heap->guarded_sample_rate = 0; + if (rate != 0) { heap->guarded_sample_rate = 0; } // don't write to constant heap_empty void* p = _mi_heap_malloc_zero(heap, size, zero); - heap->guarded_sample_rate = rate; + if (rate != 0) { heap->guarded_sample_rate = rate; } return p; } #else @@ -58,21 +57,20 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t void* p; size_t oversize; - if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { - // use OS allocation for very large alignment and allocate inside a huge page (not in an arena) - // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned at a point in the - // first (and single) page such that the page info is `MI_ARENA_SLICE_SIZE` bytes before it (and can be found in the _mi_page_map). + if mi_unlikely(alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { + // use OS allocation for large alignments and allocate inside a singleton page (not in an arena) + // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned + // in the first (and single) page such that the page info is `MI_PAGE_ALIGN` bytes before it (and can be found in the _mi_page_map). 
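A minimal sketch of the page-map lookup that the comment above refers to, assuming 64 KiB arena slices; the names and constants here (SLICE_SHIFT, page_map, ptr_to_page) are illustrative stand-ins, not the actual mimalloc definitions. Registration stores i+1 in the byte map for the i-th slice of a page, and 0 marks memory not managed by the allocator, so any interior pointer can be mapped back to its page header in a few instructions:

#include <stddef.h>
#include <stdint.h>

#define SLICE_SHIFT 16          /* assumed 64 KiB slices (MI_ARENA_SLICE_SHIFT in mimalloc) */
extern uint8_t* page_map;       /* one byte per address-space slice (like _mi_page_map) */

static void* ptr_to_page(const void* p) {
  const size_t  idx = (size_t)((uintptr_t)p >> SLICE_SHIFT);  /* slice index of the pointer */
  const uint8_t ofs = page_map[idx];                          /* 1-based slice offset within its page */
  if (ofs == 0) return NULL;                                  /* not a pointer into a managed page */
  return (void*)(((uintptr_t)idx - ofs + 1) << SLICE_SHIFT);  /* slice 0 of the page holds its header */
}
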
if mi_unlikely(offset != 0) { // todo: cannot support offset alignment for very large alignments yet -#if MI_DEBUG > 0 - _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset); -#endif + #if MI_DEBUG > 0 + _mi_error_message(EOVERFLOW, "aligned allocation with a large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset); + #endif return NULL; } oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); // note: no guarded as alignment > 0 - p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block - // zero afterwards as only the area from the aligned_p may be committed! + p = _mi_heap_malloc_zero_ex(heap, oversize, zero, alignment); // the page block size should be large enough to align in the single huge page block if (p == NULL) return NULL; } else { @@ -113,13 +111,13 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t #endif // now zero the block if needed - if (alignment > MI_BLOCK_ALIGNMENT_MAX) { - // for the tracker, on huge aligned allocations only from the start of the large block is defined - mi_track_mem_undefined(aligned_p, size); - if (zero) { - _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); - } - } + //if (alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { + // // for the tracker, on huge aligned allocations only from the start of the large block is defined + // mi_track_mem_undefined(aligned_p, size); + // if (zero) { + // _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); + // } + //} if (p != aligned_p) { mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p)); diff --git a/src/arena.c b/src/arena.c index b59f8ad3..a2d3f560 100644 --- a/src/arena.c +++ b/src/arena.c @@ -354,9 +354,9 @@ static mi_decl_noinline void* mi_arena_try_alloc( bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - mi_assert(slice_count <= MI_ARENA_MAX_OBJ_BLOCKS); + mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); - + // try to find free slices in the arena's void* p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; @@ -469,33 +469,48 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl return NULL; } -static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, size_t block_alignment, + mi_arena_id_t req_arena_id, mi_tld_t* tld) { const bool allow_large = true; const bool commit = true; - const size_t alignment = 1; - + const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); + const size_t page_alignment = MI_ARENA_SLICE_ALIGN; + // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; - if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) { - page = (mi_page_t*)mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, &memid, tld); + if (!_mi_option_get_fast(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? 
+ !os_align && // not large alignment + slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large + { + page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, &memid, tld); } // otherwise fall back to the OS if (page == NULL) { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + if (os_align) { + // note: slice_count already includes the page + mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + } + else { + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + } } if (page == NULL) return NULL; + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // claimed free slices: initialize the page partly - _mi_memzero_aligned(page, sizeof(*page)); - mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN)); - const size_t reserved = (mi_size_of_slices(slice_count) - MI_PAGE_INFO_SIZE) / block_size; + if (!memid.initially_zero) { _mi_memzero_aligned(page, sizeof(*page)); } + mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)); + const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); + const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; - page->page_start = (uint8_t*)page + MI_PAGE_INFO_SIZE; + page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; page->memid = memid; page->free_is_zero = memid.initially_zero; @@ -523,7 +538,7 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size } // 2. find a free block, potentially allocating a new arena - page = mi_arena_page_alloc_fresh(slice_count, block_size, req_arena_id, tld); + page = mi_arena_page_alloc_fresh(slice_count, block_size, 1, req_arena_id, tld); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -534,18 +549,27 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size } -static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { - MI_UNUSED(heap); MI_UNUSED(block_size); MI_UNUSED(page_alignment); - _mi_error_message(EINVAL, "singleton page is not yet implemented\n"); - return NULL; +static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { + const mi_arena_id_t req_arena_id = heap->arena_id; + mi_tld_t* const tld = heap->tld; + const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); + const size_t info_size = (os_align ? 
MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); + const size_t slice_count = mi_slice_count_of_size(info_size + block_size); + + mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld); + if (page == NULL) return NULL; + + mi_assert(page != NULL); + mi_assert(page->reserved == 1); + return page; } -mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { mi_page_t* page; - if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { - mi_assert_internal(_mi_is_power_of_two(page_alignment)); - page = mi_singleton_page_alloc(heap, block_size, page_alignment); + if mi_unlikely(block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { + mi_assert_internal(_mi_is_power_of_two(block_alignment)); + page = mi_singleton_page_alloc(heap, block_size, block_alignment); } else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); @@ -557,7 +581,7 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } else { - page = mi_singleton_page_alloc(heap, block_size, page_alignment); + page = mi_singleton_page_alloc(heap, block_size, block_alignment); } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; @@ -598,7 +622,10 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_is_abandoned(page)); } - if (!mi_page_is_abandoned(page)) return false; // it is not abandoned + // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned (anymore) + + // note: we can access the page even it is in the meantime reclaimed by another thread since + // we only call this when on free (and thus there is still an object alive in the page) mi_memid_t memid = page->memid; if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's @@ -623,11 +650,12 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { } else { // A page in OS or external memory + if (mi_atomic_load_acquire(&page->xheap) != (uintptr_t)heap->tld->subproc) return false; + // we use the thread_id to atomically grab ownership - // TODO: respect the subproc -- do we need to add this to the page? mi_threadid_t abandoned_thread_id = 0; if (mi_atomic_cas_strong_acq_rel(&page->xthread_id, &abandoned_thread_id, heap->thread_id)) { - // we unabandoned partly + // we got it atomically _mi_page_reclaim(heap, page); mi_assert_internal(!mi_page_is_abandoned(page)); return true; diff --git a/src/bitmap.c b/src/bitmap.c index fe44bb67..dd1afe75 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -263,7 +263,7 @@ restore: // set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. // todo: try neon version static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { -#if 0 && defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) +#if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 
0xFF : 0) diff --git a/src/free.c b/src/free.c index 5dbea4a4..c7d92292 100644 --- a/src/free.c +++ b/src/free.c @@ -115,7 +115,7 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) #endif mi_page_t* const page = _mi_ptr_page(p); #if MI_DEBUG - if (page == MI_PAGE_PTR_INVALID) { + if (page == NULL && p != NULL) { _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); } #endif @@ -126,11 +126,9 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { - if (p==NULL) return; mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); - // if mi_unlikely(page==NULL) return; - - + if mi_unlikely(page==NULL) return; + const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); if mi_likely(is_local) { // thread-local free? if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) @@ -235,11 +233,14 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block { // the page is abandoned, try to reclaim it into our heap if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); + mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); mi_free(block); // recursively free as now it will be a local free in our heap return; } + else { + mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages + } } // The padding check may access the non-thread-owned page for the key values. 
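The reclaim-on-free path above claims an abandoned OS/external page by atomically swapping its thread id from 0 to the freeing thread's id. A minimal sketch of that claim step using plain C11 atomics (mimalloc uses its own atomic wrappers; threadid_t and try_claim_abandoned are illustrative names):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef uintptr_t threadid_t;

/* An abandoned page carries thread id 0; the first freeing thread to CAS its
   own id in wins ownership and may then reclaim the page into its heap. */
static bool try_claim_abandoned(_Atomic(threadid_t)* page_thread_id, threadid_t my_id) {
  threadid_t expected = 0;
  return atomic_compare_exchange_strong_explicit(page_thread_id, &expected, my_id,
                                                 memory_order_acq_rel, memory_order_acquire);
}
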
diff --git a/src/page-map.c b/src/page-map.c index 624f615c..a3e9a649 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -37,7 +37,8 @@ static bool mi_page_map_init(void) { // commit the first part so NULL pointers get resolved without an access violation if (!mi_page_map_all_committed) { _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL); - _mi_page_map[0] = -1; // so _mi_ptr_page(NULL) == NULL + _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL + mi_assert_internal(_mi_ptr_page(NULL)==NULL); } return true; } @@ -60,9 +61,9 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) { size_t page_size; *page_start = mi_page_area(page, &page_size); - if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE; } // furthest interior pointer - *slice_count = mi_slice_count_of_size(page_size); - return ((uintptr_t)*page_start >> MI_ARENA_SLICE_SHIFT); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer + *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks + return ((uintptr_t)page >> MI_ARENA_SLICE_SHIFT); } @@ -79,9 +80,9 @@ void _mi_page_map_register(mi_page_t* page) { mi_page_map_ensure_committed(idx, slice_count); // set the offsets - for (int i = 0; i < (int)slice_count; i++) { + for (size_t i = 0; i < slice_count; i++) { mi_assert_internal(i < 128); - _mi_page_map[idx + i] = (i+1); + _mi_page_map[idx + i] = (uint8_t)(i+1); } } diff --git a/src/page.c b/src/page.c index d91b9123..af55b3b3 100644 --- a/src/page.c +++ b/src/page.c @@ -41,9 +41,10 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page); #if (MI_DEBUG>=3) static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { + mi_assert_internal(_mi_ptr_page(page) == page); size_t count = 0; while (head != NULL) { - mi_assert_internal(page == _mi_ptr_page(head)); + mi_assert_internal((uint8_t*)head - (uint8_t*)page > MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); count++; head = mi_block_next(page, head); } @@ -123,7 +124,7 @@ bool _mi_page_is_valid(mi_page_t* page) { { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); - mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_MAX_OBJ_SIZE || mi_page_is_in_full(page)); + mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page)); mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); } } @@ -258,7 +259,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); // TODO: push on full queue immediately if it is full? - mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); + mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); mi_page_queue_push(heap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); } @@ -279,6 +280,15 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size } if (mi_page_is_abandoned(page)) { _mi_page_reclaim(heap, page); + if (!mi_page_immediate_available(page)) { + if (mi_page_is_expandable(page)) { + mi_page_extend_free(heap, page); + } + else { + mi_assert(false); // should not happen? 
+ return NULL; + } + } } else if (pq != NULL) { mi_page_queue_push(heap, pq, page); @@ -295,7 +305,7 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); if (page==NULL) return NULL; mi_assert_internal(pq->block_size==mi_page_block_size(page)); - mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page))); + mi_assert_internal(pq==mi_heap_page_queue_of(heap, page)); return page; } @@ -713,7 +723,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { -------------------------------------------------------------*/ // search for a best next page to use for at most N pages (often cut short if immediate blocks are available) -#define MI_MAX_CANDIDATE_SEARCH (0) +#define MI_MAX_CANDIDATE_SEARCH (8) // Find a page with free blocks of `page->block_size`. @@ -723,7 +733,9 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p #if MI_STAT size_t count = 0; #endif + #if MI_MAX_CANDIDATE_SEARCH > 1 size_t candidate_count = 0; // we reset this on the first candidate to limit the search + #endif mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -793,17 +805,21 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p mi_assert_internal(mi_page_is_expandable(page)); mi_page_extend_free(heap, page); } + mi_assert_internal(mi_page_immediate_available(page)); } if (page == NULL) { _mi_heap_collect_retired(heap, false); // perhaps make a page available page = mi_page_fresh(heap, pq); + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); if (page == NULL && first_try) { // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again page = mi_page_queue_find_free_ex(heap, pq, false); + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); } } else { + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); // move the page to the front of the queue mi_page_queue_move_to_front(heap, pq, page); page->retire_expire = 0; diff --git a/test/test-api.c b/test/test-api.c index 15484544..ee7c56bb 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -34,7 +34,7 @@ we therefore test the API over various inputs. 
Please add more tests :-) #include "mimalloc.h" // #include "mimalloc/internal.h" -#include "mimalloc/types.h" // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX +#include "mimalloc/types.h" // for MI_DEBUG and MI_PAGE_MAX_OVERALLOC_ALIGN #include "testhelper.h" @@ -169,7 +169,7 @@ int main(void) { /* CHECK_BODY("malloc-aligned6") { bool ok = true; - for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) { + for (size_t align = 1; align <= MI_PAGE_MAX_OVERALLOC_ALIGN && ok; align *= 2) { void* ps[8]; for (int i = 0; i < 8 && ok; i++) { ps[i] = mi_malloc_aligned(align*13 // size @@ -186,16 +186,16 @@ int main(void) { }; */ CHECK_BODY("malloc-aligned7") { - void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX); + void* p = mi_malloc_aligned(1024,MI_PAGE_MAX_OVERALLOC_ALIGN); mi_free(p); - result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0; + result = ((uintptr_t)p % MI_PAGE_MAX_OVERALLOC_ALIGN) == 0; }; CHECK_BODY("malloc-aligned8") { bool ok = true; for (int i = 0; i < 5 && ok; i++) { int n = (1 << i); - void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX); - ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0; + void* p = mi_malloc_aligned(1024, n * MI_PAGE_MAX_OVERALLOC_ALIGN); + ok = ((uintptr_t)p % (n*MI_PAGE_MAX_OVERALLOC_ALIGN)) == 0; mi_free(p); } result = ok; @@ -203,7 +203,7 @@ int main(void) { CHECK_BODY("malloc-aligned9") { // test large alignments bool ok = true; void* p[8]; - size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 }; + size_t sizes[8] = { 8, 512, 1024 * 1024, MI_PAGE_MAX_OVERALLOC_ALIGN, MI_PAGE_MAX_OVERALLOC_ALIGN + 1, 2 * MI_PAGE_MAX_OVERALLOC_ALIGN, 8 * MI_PAGE_MAX_OVERALLOC_ALIGN, 0 }; for (int i = 0; i < 28 && ok; i++) { int align = (1 << i); for (int j = 0; j < 8 && ok; j++) { diff --git a/test/test-stress.c b/test/test-stress.c index e2133f7d..76dfe877 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -42,7 +42,7 @@ static int SCALE = 10; static int ITER = 10; #elif 0 static int THREADS = 4; -static int SCALE = 20; +static int SCALE = 100; static int ITER = 20; #else static int THREADS = 32; // more repeatable if THREADS <= #processors @@ -54,7 +54,7 @@ static int ITER = 50; // N full iterations destructing and re-creating a #define STRESS // undefine for leak test -static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) +static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? 
static bool main_participates = false; // main thread participates as a worker too From 8f2a5864b8c88913ce6d68f8bd7c40f1aae230f2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 22:54:57 -0800 Subject: [PATCH 016/264] pass all debug tests --- include/mimalloc/internal.h | 2 +- src/alloc.c | 6 +++++- src/arena.c | 20 +++++++++++++++++++- src/os.c | 16 +++++++++++----- src/page-map.c | 12 ++++++++++-- src/page.c | 4 ++++ 6 files changed, 50 insertions(+), 10 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 02a62bec..01b7076b 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -463,7 +463,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { } static inline mi_page_t* _mi_ptr_page(const void* p) { - #if MI_DEBUG + #if 1 // MI_DEBUG return _mi_checked_ptr_page(p); #else return _mi_ptr_page_ex(p,NULL); diff --git a/src/alloc.c b/src/alloc.c index 00f6d1a4..840d34fe 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -30,7 +30,11 @@ terms of the MIT license. A copy of the license can be found in the file // Note: in release mode the (inlined) routine is about 7 instructions with a single test. extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { - mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); + if (page->block_size != 0) { // not the empty heap + mi_assert_internal(mi_page_block_size(page) >= size); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + } // check the free list mi_block_t* const block = page->free; diff --git a/src/arena.c b/src/arena.c index a2d3f560..66f83d4f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -462,6 +462,9 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(!mi_page_is_full(page)); mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); return page; } } @@ -521,6 +524,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz page->block_size_shift = 0; } _mi_page_map_register(page); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); @@ -561,6 +566,9 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si mi_assert(page != NULL); mi_assert(page->reserved == 1); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + return page; } @@ -584,6 +592,11 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block page = mi_singleton_page_alloc(heap, block_size, block_alignment); } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(block_alignment <= MI_PAGE_MAX_OVERALLOC_ALIGN || _mi_is_aligned(mi_page_start(page), block_alignment)); + return page; } @@ -601,11 +614,14 @@ void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { 
void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(page->next==NULL); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + if (mi_page_all_free(page)) { _mi_arena_page_free(page, tld); } - else if (page->memid.memkind==MI_MEM_ARENA) { + else if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); size_t slice_index; @@ -622,6 +638,8 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_is_abandoned(page)); } + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned (anymore) // note: we can access the page even it is in the meantime reclaimed by another thread since diff --git a/src/os.c b/src/os.c index 110d7ec6..931abc7f 100644 --- a/src/os.c +++ b/src/os.c @@ -219,20 +219,26 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); + const bool use_overalloc = (alignment > mi_os_mem_config.alloc_granularity && alignment <= size/8); + // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) - void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); - if (p == NULL) return NULL; + void* p = NULL; + if (!use_overalloc) { + p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); + } // aligned already? 
- if (((uintptr_t)p % alignment) == 0) { + if (p != NULL && ((uintptr_t)p % alignment) == 0) { *base = p; } else { // if not aligned, free it, overallocate, and unmap around it #if !MI_TRACK_ASAN - _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); + if (!use_overalloc) { + _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); + } #endif - mi_os_prim_free(p, size, commit, stats); + if (p != NULL) { mi_os_prim_free(p, size, commit, stats); } if (size >= (SIZE_MAX - alignment)) return NULL; // overflow const size_t over_size = size + alignment; diff --git a/src/page-map.c b/src/page-map.c index a3e9a649..15578301 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -36,7 +36,9 @@ static bool mi_page_map_init(void) { } // commit the first part so NULL pointers get resolved without an access violation if (!mi_page_map_all_committed) { - _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL); + bool is_zero; + _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero, NULL); + if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(_mi_page_map, _mi_os_page_size()); } _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL mi_assert_internal(_mi_ptr_page(NULL)==NULL); } @@ -51,7 +53,11 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, i, 1)) { // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit(_mi_page_map + (i*mi_page_map_entries_per_commit_bit), mi_page_map_entries_per_commit_bit, NULL, NULL); + bool is_zero; + uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); + const size_t size = mi_page_map_entries_per_commit_bit; + _mi_os_commit(start, size, &is_zero, NULL); + if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, i, 1, NULL); } } @@ -69,6 +75,8 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* void _mi_page_map_register(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(page,MI_PAGE_ALIGN)); if mi_unlikely(_mi_page_map == NULL) { if (!mi_page_map_init()) return; } diff --git a/src/page.c b/src/page.c index af55b3b3..243d9bf7 100644 --- a/src/page.c +++ b/src/page.c @@ -745,7 +745,9 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p #if MI_STAT count++; #endif + #if MI_MAX_CANDIDATE_SEARCH > 1 candidate_count++; + #endif // collect freed blocks by us and other threads _mi_page_free_collect(page, false); @@ -978,6 +980,8 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_block_size(page) >= size); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); // and try again, this time succeeding! (i.e. 
this should never recurse through _mi_page_malloc) if mi_unlikely(zero && mi_page_is_huge(page)) { From 1d7a9f62a517e6667b50175bce4766b1c5d0f495 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 1 Dec 2024 12:54:16 -0800 Subject: [PATCH 017/264] bug fixes --- include/mimalloc/internal.h | 2 +- src/arena.c | 17 +++++++---------- src/free.c | 35 ++++++++++++++++++++--------------- src/init.c | 3 +-- src/os.c | 7 ++++--- test/test-stress.c | 2 +- 6 files changed, 34 insertions(+), 32 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 01b7076b..ec106047 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -598,7 +598,7 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_page_thread_id(page) == 0); + return (mi_atomic_load_acquire(&page->xthread_id) == 0); } static inline bool mi_page_is_huge(const mi_page_t* page) { diff --git a/src/arena.c b/src/arena.c index 66f83d4f..a713a110 100644 --- a/src/arena.c +++ b/src/arena.c @@ -646,11 +646,12 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { // we only call this when on free (and thus there is still an object alive in the page) mi_memid_t memid = page->memid; if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's + if (mi_atomic_load_acquire(&page->xheap) != (uintptr_t)heap->tld->subproc) return false; if mi_likely(memid.memkind == MI_MEM_ARENA) { size_t slice_index; mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); - if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess + //if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess // don't reclaim more from a `free` call than half the current segments // this is to prevent a pure free-ing thread to start owning too many segments @@ -665,6 +666,11 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(!mi_page_is_abandoned(page)); return true; } + else { + if (mi_page_is_abandoned(page)) { + mi_assert(false); + } + } } else { // A page in OS or external memory @@ -1089,15 +1095,6 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv -/* ----------------------------------------------------------- - Abandoned pages ------------------------------------------------------------ */ - -void mi_arena_page_abandon(mi_page_t* page) { - mi_assert_internal(mi_page_is_abandoned(page)); - if (mi_page_is_full(page)) {} -} - /* ----------------------------------------------------------- diff --git a/src/free.c b/src/free.c index c7d92292..f0ce8c22 100644 --- a/src/free.c +++ b/src/free.c @@ -225,24 +225,29 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { // first see if the page was abandoned and if we can reclaim it into our thread - if (mi_page_is_abandoned(page) && - (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || - mi_page_is_singleton(page) // only one block, and we are free-ing it - ) && - mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) - { - // the page is 
abandoned, try to reclaim it into our heap - if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); - // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - else { - mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages + if (mi_page_is_abandoned(page)) { + if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || + mi_page_is_singleton(page)) { // only one block, and we are free-ing it + if (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) + { + // the page is abandoned, try to reclaim it into our heap + if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue + mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); + // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); + mi_free(block); // recursively free as now it will be a local free in our heap + return; + } + else { + if (mi_page_is_abandoned(page)) { + mi_assert(false); + } + mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages + } + } } } + // The padding check may access the non-thread-owned page for the key values. // that is safe as these are constant and the page won't be freed (as the block is not freed yet). mi_check_padding(page, block); diff --git a/src/init.c b/src/init.c index 16130af7..2378b3c8 100644 --- a/src/init.c +++ b/src/init.c @@ -396,8 +396,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = 0; - mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } diff --git a/src/os.c b/src/os.c index 931abc7f..0aa0a681 100644 --- a/src/os.c +++ b/src/os.c @@ -219,11 +219,12 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); - const bool use_overalloc = (alignment > mi_os_mem_config.alloc_granularity && alignment <= size/8); + // try a direct allocation if the alignment is below the default, or if larger than 1/64 fraction of the size (to avoid waste). 
+ const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/64); // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) void* p = NULL; - if (!use_overalloc) { + if (try_direct_alloc) { p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); } @@ -234,7 +235,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit else { // if not aligned, free it, overallocate, and unmap around it #if !MI_TRACK_ASAN - if (!use_overalloc) { + if (try_direct_alloc) { _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); } #endif diff --git a/test/test-stress.c b/test/test-stress.c index 76dfe877..9a89744e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -36,7 +36,7 @@ static int ITER = 400; static int THREADS = 8; static int SCALE = 25; static int ITER = 20; -#elif defined(xMI_GUARDED) // with debug guard pages reduce parameters to stay within the azure pipeline limits +#elif defined(MI_GUARDED) // with debug guard pages reduce parameters to stay within the azure pipeline limits static int THREADS = 8; static int SCALE = 10; static int ITER = 10; From 2f789aae9a1ed271e3feb22e4ead04db809e4e2e Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 1 Dec 2024 16:26:59 -0800 Subject: [PATCH 018/264] wip: cannot compile --- include/mimalloc/internal.h | 84 +++++++++++++++++++------------------ include/mimalloc/types.h | 20 +++++---- src/bitmap.c | 45 ++++++++++++++++++++ src/bitmap.h | 28 ++++++++++++- src/free.c | 81 +++++++++++++++++++++++------------ 5 files changed, 181 insertions(+), 77 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index ec106047..84244c21 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -92,11 +92,13 @@ bool _mi_preloading(void); // true while the C runtime is not in void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); + mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; -size_t _mi_thread_seq_id(void) mi_attr_noexcept; +size_t _mi_thread_seq_id(void) mi_attr_noexcept; + mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); -void _mi_heap_guarded_init(mi_heap_t* heap); +void _mi_heap_guarded_init(mi_heap_t* heap); // os.c void _mi_os_init(void); // called from process init @@ -180,8 +182,6 @@ void _mi_heap_delayed_free_all(mi_heap_t* heap); bool _mi_heap_delayed_free_partial(mi_heap_t* heap); void _mi_heap_collect_retired(mi_heap_t* heap, bool force); -void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); -bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); void _mi_deferred_free(mi_heap_t* heap, bool force); @@ -426,6 +426,10 @@ static inline uintptr_t _mi_ptr_cookie(const void* p) { return ((uintptr_t)p ^ _mi_heap_main.cookie); } +static inline mi_tld_t* _mi_tld(void) { + return mi_heap_get_default()->tld; +} + /* ----------------------------------------------------------- Pages ----------------------------------------------------------- */ @@ -507,53 +511,53 @@ static inline size_t 
mi_page_usable_block_size(const mi_page_t* page) { return mi_page_block_size(page) - MI_PADDING_SIZE; } +//static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { +// mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); +// if (heap != NULL) { +// mi_atomic_store_release(&page->xheap, (uintptr_t)heap); +// page->heap_tag = heap->tag; +// mi_atomic_store_release(&page->xthread_id, heap->thread_id); +// } +// else { +// mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc); +// mi_atomic_store_release(&page->xthread_id,0); +// } +//} + +// Thread free flag helpers +static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { + return (mi_block_t*)(tf & ~1); +} +static inline bool mi_tf_is_owned(mi_thread_free_t tf) { + return ((tf & 1) == 0); +} +static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { + return (mi_thread_free_t)((uintptr_t)block | (owned ? 0 : 1)); +} + + // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { - return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3); + return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); } -static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) { - return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3); -} - -// Heap access -static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { - return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap)); +// Owned? +static inline bool mi_page_is_owned(const mi_page_t* page) { + return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); } +// Thread id of thread that owns this page static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { return mi_atomic_load_relaxed(&page->xthread_id); } -static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { - mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); - if (heap != NULL) { - mi_atomic_store_release(&page->xheap, (uintptr_t)heap); - page->heap_tag = heap->tag; - mi_atomic_store_release(&page->xthread_id, heap->thread_id); - } - else { - mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc); - mi_atomic_store_release(&page->xthread_id,0); - } -} -// Thread free flag helpers -static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { - return (mi_block_t*)(tf & ~0x03); -} -static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) { - return (mi_delayed_t)(tf & 0x03); -} -static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) { - return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed); -} -static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { - return mi_tf_make(mi_tf_block(tf),delayed); -} -static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { - return mi_tf_make(block, mi_tf_delayed(tf)); -} +//static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { +// return mi_tf_make(mi_tf_block(tf),delayed); +//} +//static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { +// return mi_tf_make(block, mi_tf_delayed(tf)); +//} // are all blocks in a page freed? // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`. 
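A minimal sketch of the tagged thread-free list that the mi_tf_* helpers above encode, assuming blocks are at least pointer-aligned so the low bit is free; block_t, thread_free_t and thread_free_push are illustrative names, not the mimalloc API. The low bit marks an abandoned (unowned) page, so pushing a freed block with the bit cleared both enqueues the block and, if the previous head was tagged, transfers ownership of the page to the pushing thread:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct block_s { struct block_s* next; } block_t;
typedef uintptr_t thread_free_t;   /* tagged pointer: low bit set = page is abandoned */

/* Push a freed block on the page's atomic thread-free list; returns true if
   this push also took ownership of a previously abandoned page. */
static bool thread_free_push(_Atomic(thread_free_t)* xthread_free, block_t* block) {
  thread_free_t old_tf = atomic_load_explicit(xthread_free, memory_order_relaxed);
  thread_free_t new_tf;
  do {
    block->next = (block_t*)(old_tf & ~(thread_free_t)1);   /* link to current head, tag bit stripped */
    new_tf = (thread_free_t)(uintptr_t)block;               /* bit 0 clear: owned after this push */
  } while (!atomic_compare_exchange_weak_explicit(xthread_free, &old_tf, new_tf,
                                                  memory_order_release, memory_order_relaxed));
  return (old_tf & 1) != 0;   /* previous head was tagged abandoned: this thread reclaimed it */
}
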
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 271c7efb..7329cb86 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -216,13 +216,14 @@ typedef struct mi_block_s { #endif -// The delayed flags are used for efficient multi-threaded free-ing -typedef enum mi_delayed_e { - MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list - MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap - MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list - MI_NEVER_DELAYED_FREE = 3 // sticky: used for abondoned pages without a owning heap; this only resets on page reclaim -} mi_delayed_t; +// The owned flags are used for efficient multi-threaded free-ing +// When we push on the page thread free queue of an abandoned page, +// we also atomically get to own it. This is needed to atomically +// abandon a page (while other threads could concurrently free blocks in it). +typedef enum mi_owned_e { + MI_OWNED = 0, // some heap owns this page + MI_ABANDONED = 1, // the page is abandoned +} mi_owned_t; // The `in_full` and `has_aligned` page flags are put in a union to efficiently @@ -247,7 +248,7 @@ typedef union mi_page_flags_s { #endif // Thread free list. -// We use the bottom 2 bits of the pointer for mi_delayed_t flags +// We use the bottom bit of the pointer for `mi_owned_t` flags typedef uintptr_t mi_thread_free_t; // Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) @@ -304,10 +305,11 @@ typedef struct mi_page_s { #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - _Atomic(uintptr_t) xheap; // heap this threads belong to. + // _Atomic(uintptr_t) xheap; // heap this threads belong to. 
struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + mi_subproc_t* subproc; // sub-process of this heap mi_memid_t memid; // provenance of the page memory } mi_page_t; diff --git a/src/bitmap.c b/src/bitmap.c index dd1afe75..5cce6bfa 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -693,3 +693,48 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t mi_bitmap_forall_set_chunks_end(); return false; } + + + +mi_decl_nodiscard bool mi_pairmap_xset(mi_pair_t set, mi_bitmap_t* bitmap, size_t idx); +mi_decl_nodiscard bool mi_pairmap_xset_while_not_busy(mi_pair_t set, mi_bitmap_t* bitmap, size_t idx); + +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) { + size_t set_idx; + size_t start = tseq % MI_BFIELD_BITS; + size_t epoch = mi_atomic_load_acquire(&pairmap->epoch); + mi_bfield_t any_set = mi_bfield_rotate_right(mi_atomic_load_relaxed(&pairmap->any_set), start); + while (mi_bfield_find_least_bit(any_set, &set_idx)) { + size_t chunk_idx = 2*((set_idx + start) % MI_BFIELD_BITS); + { + // look at chunk_idx and chunck_idx+1 + mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx]; + mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx+1]; + size_t cidx; + if (mi_pairmap_chunk_find_and_set_busy(chunk1, &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx < MI_PAIRMAP_MAX_BITS); + return true; + } + else { + if (mi_pairmap_chunk_find_and_set_busy(chunk2, &cidx)) { + *pidx = ((chunk_idx+1) * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx < MI_PAIRMAP_MAX_BITS); + return true; + } + else if (mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk2)) { + + mi_bfield_atomic_xset(MI_BIT_CLEAR, &pairmap->any_set, chunk_idx/2); + } + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR, &bitmap->any_set, chunk_idx); + } + } + } + start += set_idx+1; /* so chunk_idx stays valid */ + any_set >>= set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ + any_set >>= 1; + } +} diff --git a/src/bitmap.h b/src/bitmap.h index 1a180924..2b4bfc25 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -41,7 +41,7 @@ typedef mi_decl_align(32) struct mi_bitmap_s { #define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit /* -------------------------------------------------------------------------------- - Bitmap + Atomic bitmap -------------------------------------------------------------------------------- */ typedef bool mi_bit_t; @@ -89,4 +89,30 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); + +/* -------------------------------------------------------------------------------- + Atomic bitmap for a pair of bits +-------------------------------------------------------------------------------- */ + +typedef mi_bfield_t mi_pair_t; + +#define MI_PAIR_CLEAR (0) +#define MI_PAIR_BUSY (1) +#define MI_PAIR_BUSYX (2) +#define MI_PAIR_SET (3) + +typedef mi_decl_align(32) struct mi_pairmap_s { + mi_bitmap_chunk_t chunks[2*MI_BFIELD_BITS]; + _Atomic(mi_bfield_t) any_set; + _Atomic(size_t) epoch; +} mi_pairmap_t; + +#define MI_PAIRMAP_MAX_PAIRS (MI_BITMAP_MAX_BITS) // 16k pairs on 64bit, 8k pairs on 32bit +#define MI_PAIRMAP_MAX_BITS (2*MI_PAIRMAP_MAX_PAIRS) + +mi_decl_nodiscard bool mi_pairmap_xset(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx); +mi_decl_nodiscard bool mi_pairmap_xset_while_not_busy(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx); +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t n, size_t tseq, size_t* pidx); + + #endif // MI_XBITMAP_H diff --git a/src/free.c b/src/free.c index f0ce8c22..42fcd07e 100644 --- a/src/free.c +++ b/src/free.c @@ -147,39 +147,66 @@ void mi_free(void* p) mi_attr_noexcept } } -// return true if successful -bool _mi_free_delayed_block(mi_block_t* block) { - // get segment and page - mi_assert_internal(block!=NULL); - mi_page_t* const page = mi_checked_ptr_page(block,"_mi_free_delayed_block"); - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); - // Clear the no-delayed flag so delayed freeing is used again for this page. - // This must be done before collecting the free lists on this page -- otherwise - // some blocks may end up in the page `thread_free` list with no blocks in the - // heap `thread_delayed_free` list which may cause the page to be never freed! - // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) - if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { - return false; - } - - // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count - _mi_page_free_collect(page, false); - - // and free the block (possibly freeing the page as well since `used` is updated) - mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */); - return true; -} // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ -// Push a block that is owned by another thread on its page-local thread free -// list or it's heap delayed free list. Such blocks are later collected by -// the owning thread in `_mi_free_delayed_block`. -static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) +static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_thread_id(page)==0); + + // we own the page now.. 
+ // first remove it from the abandoned pages in the arena + mi_heap_t* const heap = mi_heap_get_default(); + _mi_arena_page_unabandon(page,heap->tld); + + // collect the thread atomic free list + _mi_page_free_collect(page, false); // update `used` count + if (mi_page_is_singleton(page)) mi_assert_internal(mi_page_all_free(page)); + + if (mi_page_all_free(page)) { + // we can free the page directly + _mi_arena_page_free(page, heap->tld); + } + else { + // the page has still some blocks in use + // reclaim in our heap if compatible, or otherwise abandon again + if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && + (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) && // we did not already terminate our thread (can this happen? yes, due to thread-local destructors for example (issue #944)) + (page->subproc == heap->tld->subproc) && // don't reclaim across sub-processes + mi_arena_page_try_reclaim(page) // and we can reclaim it from the arena + ) + { + // make it part of our heap + _mi_heap_page_reclaim(heap, page); + } + else { + // abandon again + _mi_arena_page_abandon(page, heap->tld); + } + } +} + +// Push a block that is owned by another thread on its page-local thread free list. +static void mi_decl_noinline mi_free_block_delayed_mt(mi_page_t* page, mi_block_t* block) { + // push atomically on the page thread free list + mi_thread_free_t tf_new; + mi_thread_free_t tf; + do { + tf = mi_atomic_load_relaxed(&page->xthread_free); + mi_block_set_next(page, block, mi_tf_block(tf)); + tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf, tf_new)); + + // and atomically reclaim the page if it was abandoned + bool reclaimed = !mi_tf_is_owned(tf); + if (reclaimed) mi_free_try_reclaim_mt(page); +} + + /* // Try to put the block on either the page-local thread free list, // or the heap delayed free list (if this is the first non-local free in that page) mi_thread_free_t tfreex; @@ -276,7 +303,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block // thread_delayed free list (or heap delayed free list) mi_free_block_delayed_mt(page,block); } - +*/ // ------------------------------------------------------ // Usable size From 69ac69abac87b513674f79d1217aab00e2b6ccb8 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 00:31:08 -0800 Subject: [PATCH 019/264] wip: use epoch with 512bit chunks --- include/mimalloc/bits.h | 119 +++++--- include/mimalloc/types.h | 2 +- src/arena.c | 61 +++-- src/bitmap.c | 567 +++++++++++++++++++++++++++------------ src/bitmap.h | 65 ++++- src/free.c | 4 +- src/libc.c | 10 +- src/options.c | 2 +- 8 files changed, 574 insertions(+), 256 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 79034c2f..90d56b4f 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -145,20 +145,13 @@ typedef int32_t mi_ssize_t; size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); +uint32_t _mi_ctz_generic32(uint32_t x); static inline size_t mi_ctz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 uint64_t r; __asm volatile ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); return r; - #elif defined(__GNUC__) && MI_ARCH_ARM64 - uint64_t r; - __asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc"); - return r; - #elif defined(__GNUC__) && MI_ARCH_RISCV 
- size_t r; - __asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : ); - return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_tzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -168,6 +161,17 @@ static inline size_t mi_ctz(size_t x) { #else return (_BitScanForward64(&idx, x) ? (size_t)idx : 64); #endif + /* + // for arm64 and riscv, the builtin_ctz is defined for 0 as well + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + */ #elif mi_has_builtin_size(ctz) return (x!=0 ? (size_t)mi_builtin_size(ctz)(x) : MI_SIZE_BITS); #else @@ -177,18 +181,10 @@ static inline size_t mi_ctz(size_t x) { } static inline size_t mi_clz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 uint64_t r; __asm volatile ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); return r; - #elif defined(__GNUC__) && MI_ARCH_ARM64 - uint64_t r; - __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc"); - return r; - #elif defined(__GNUC__) && MI_ARCH_RISCV - size_t r; - __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : ); - return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -198,6 +194,17 @@ static inline size_t mi_clz(size_t x) { #else return (_BitScanReverse64(&idx, x) ? 63 - (size_t)idx : 64); #endif + /* + // for arm64 and riscv, the builtin_clz is defined for 0 as well + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + */ #elif mi_has_builtin_size(clz) return (x!=0 ? (size_t)mi_builtin_size(clz)(x) : MI_SIZE_BITS); #else @@ -206,6 +213,26 @@ static inline size_t mi_clz(size_t x) { #endif } +static inline uint32_t mi_ctz32(uint32_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 + uint32_t r; + __asm volatile ("tzcntl\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif MI_ARCH_X64 && defined(__BMI1__) + return (uint32_t)_tzcnt_u32(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + return (_BitScanForward(&idx, x) ? (uint32_t)idx : 32); + #elif mi_has_builtin(ctz) && (INT_MAX == INT32_MAX) + return (x!=0 ? (uint32_t)mi_builtin(ctz)(x) : 32); + #elif mi_has_builtin(ctzl) && (LONG_MAX == INT32_MAX) + return (x!=0 ? (uint32_t)mi_builtin(ctzl)(x) : 32); + #else + #define MI_HAS_FAST_BITSCAN 0 + return _mi_ctz_generic32(x); + #endif +} + #ifndef MI_HAS_FAST_BITSCAN #define MI_HAS_FAST_BITSCAN 1 #endif @@ -229,6 +256,22 @@ static inline bool mi_bsf(size_t x, size_t* idx) { #endif } +// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). 
+static inline bool mi_bsf32(uint32_t x, uint32_t* idx) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + // on x64 the carry flag is set on zero which gives better codegen + bool is_zero; + __asm ("tzcntl\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); + return !is_zero; + #else + *idx = mi_ctz32(x); + return (x!=0); + #endif +} + + // Bit scan reverse: find the most significant bit that is set // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -248,29 +291,6 @@ static inline bool mi_bsr(size_t x, size_t* idx) { } -/* -------------------------------------------------------------------------------- - find least/most significant bit position --------------------------------------------------------------------------------- */ - -// Find most significant bit index, or MI_SIZE_BITS if 0 -static inline size_t mi_find_msb(size_t x) { - #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - unsigned long i; - #if MI_SIZE_BITS==32 - return (_BitScanReverse(&i, x) ? i : 32); - #else - return (_BitScanReverse64(&i, x) ? i : 64); - #endif - #else - return (x==0 ? MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x)); - #endif -} - -// Find least significant bit index, or MI_SIZE_BITS if 0 (this equals `mi_ctz`, count trailing zero's) -static inline size_t mi_find_lsb(size_t x) { - return mi_ctz(x); -} - /* -------------------------------------------------------------------------------- rotate @@ -288,13 +308,26 @@ static inline size_t mi_rotr(size_t x, size_t r) { return _rotr64(x,(int)r); #endif #else - // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to // avoid UB when `rshift==0`. See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1)))); #endif } +static inline uint32_t mi_rotr32(uint32_t x, uint32_t r) { + #if mi_has_builtin(rotateright32) + return mi_builtin(rotateright32)(x, r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return _lrotr(x, (int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & 31; + return ((x >> rshift) | (x << ((-rshift) & 31))); + #endif +} + static inline size_t mi_rotl(size_t x, size_t r) { #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64) return mi_builtin(rotateleft64)(x,r); @@ -307,7 +340,7 @@ static inline size_t mi_rotl(size_t x, size_t r) { return _rotl64(x,(int)r); #endif #else - // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to // avoid UB when `rshift==0`. See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1)))); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 271c7efb..fe7e8227 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -120,7 +120,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #endif #endif #ifndef MI_BITMAP_CHUNK_BITS_SHIFT -#define MI_BITMAP_CHUNK_BITS_SHIFT 8 // optimized for 256 bits per chunk (avx2) +#define MI_BITMAP_CHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) #endif #define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) diff --git a/src/arena.c b/src/arena.c index a713a110..cc2fe7b8 100644 --- a/src/arena.c +++ b/src/arena.c @@ -197,7 +197,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // set the dirty bits if (arena->memid.initially_zero) { - memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count, NULL); + memid->initially_zero = mi_bitmap_setN(&arena->slices_dirty, slice_index, slice_count, NULL); } // set commit state @@ -206,7 +206,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = true; bool all_already_committed; - mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count, &all_already_committed); + mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &all_already_committed); if (!all_already_committed) { bool commit_zero = false; if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { @@ -219,13 +219,13 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } else { // no need to commit, but check if already fully committed - memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count); + memid->initially_committed = mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); - if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count)); - // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); + if (commit) { mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); } + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + // mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); return p; } @@ -455,10 +455,10 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // found an abandoned page of the right size mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); 
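/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the commit-on-
   demand pattern used by `mi_arena_try_alloc_at` above, shrunk to a single
   64-bit "committed" mask with one bit per slice. The bits are set first; only
   when some bit was not yet set does the caller have to do the (expensive) OS
   commit for the range. `setN_were_all_set` is local to this sketch and
   assumes `idx + n <= 64`.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

static bool setN_were_all_set(uint64_t* mask, unsigned idx, unsigned n) {
  const uint64_t m = (n >= 64 ? ~UINT64_C(0) : ((UINT64_C(1) << n) - 1) << idx);
  const bool all_already_set = ((*mask & m) == m);
  *mask |= m;
  return all_already_set;
}

int main(void) {
  uint64_t committed = 0;
  if (!setN_were_all_set(&committed, 4, 8)) {
    printf("slices 4..11 were not all committed yet -> commit the range\n");
  }
  if (setN_were_all_set(&committed, 6, 4)) {
    printf("slices 6..9 were already committed -> skip the OS call\n");
  }
  return 0;
}
/* ---------------------------------------------------------------------------- */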
mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(!mi_page_is_full(page)); mi_assert_internal(mi_page_is_abandoned(page)); @@ -626,7 +626,7 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { size_t bin = _mi_bin(mi_page_block_size(page)); size_t slice_index; mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); - bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_abandoned[bin], slice_index, 1, NULL); + bool were_zero = mi_bitmap_setN(&arena->slices_abandoned[bin], slice_index, 1, NULL); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); } @@ -660,7 +660,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { // return false; // } const size_t bin = _mi_bin(page->block_size); - if (mi_bitmap_try_xsetN(MI_BIT_CLEAR, &arena->slices_abandoned[bin], slice_index, 1)) { + if (mi_bitmap_try_clear(&arena->slices_abandoned[bin], slice_index)) { // we got it atomically _mi_page_reclaim(heap, page); mi_assert_internal(!mi_page_is_abandoned(page)); @@ -668,7 +668,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { } else { if (mi_page_is_abandoned(page)) { - mi_assert(false); + // mi_assert(false); } } } @@ -748,7 +748,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi else { if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slice_index, slice_count, NULL); + mi_bitmap_clearN(&arena->slices_committed, slice_index, slice_count); mi_track_mem_noaccess(p, size); if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) @@ -764,7 +764,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } // and make it available to others again - bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_free, slice_index, slice_count, NULL); + bool all_inuse = mi_bitmap_setN(&arena->slices_free, slice_index, slice_count, NULL); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; @@ -906,14 +906,14 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int } // reserve our meta info (and reserve slices outside the memory area) - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); + mi_bitmap_unsafe_setN(&arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_committed, 0, arena->slice_count); + mi_bitmap_unsafe_setN(&arena->slices_committed, 0, arena->slice_count); } else { - mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, 0, info_slices, NULL); + mi_bitmap_setN(&arena->slices_committed, 0, info_slices, NULL); } - mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, 0, info_slices, NULL); + mi_bitmap_setN(&arena->slices_dirty, 0, info_slices, NULL); return mi_arena_add(arena, arena_id, &_mi_stats_main); } @@ -973,10 +973,16 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ _mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; - for (int i = 0; i < MI_BFIELD_BITS && bit_count < slice_count; i++) { - char 
buf[MI_BITMAP_CHUNK_BITS + 32]; _mi_memzero(buf, sizeof(buf)); + for (int i = 0; i < MI_BITMAP_CHUNK_COUNT && bit_count < slice_count; i++) { + char buf[MI_BITMAP_CHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + if (j > 0 && (j % 4) == 0) { + buf[k++] = '\n'; + _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix); + buf[k++] = ' '; + buf[k++] = ' '; + } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; @@ -987,12 +993,11 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ buf[k++] = ' '; } else { - _mi_memset(buf + k, ' ', MI_BFIELD_BITS); + _mi_memset(buf + k, 'o', MI_BFIELD_BITS); k += MI_BFIELD_BITS; } - bit_count += MI_BFIELD_BITS; + bit_count += MI_BFIELD_BITS; } - _mi_output_message("%s %s\n", prefix, buf); } _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count); @@ -1113,7 +1118,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, const size_t size = mi_size_of_slices(slices); void* const p = mi_arena_slice_start(arena, slice_index); bool needs_recommit; - if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slices)) { + if (mi_bitmap_is_setN(&arena->slices_committed, slice_index, slices)) { // all slices are committed, we can purge freely needs_recommit = _mi_os_purge(p, size, stats); } @@ -1128,11 +1133,11 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, } // clear the purged slices - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slices, slice_index, NULL); + mi_bitmap_clearN(&arena->slices_purge, slices, slice_index); // update committed bitmap if (needs_recommit) { - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slices, slice_index, NULL); + mi_bitmap_clearN(&arena->slices_committed, slices, slice_index); } } diff --git a/src/bitmap.c b/src/bitmap.c index dd1afe75..d24a89be 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -44,85 +44,168 @@ static inline bool mi_bfield_find_least_to_xset(mi_bit_t set, mi_bfield_t x, siz return mi_bfield_find_least_bit((set ? ~x : x), idx); } -// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). -static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { +// Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 +static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[i], idx); -} - -static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { - mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); -} - -// Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0) -static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { +// Set/clear a sequence of `n` bits within a chunk. +// Returns true if all bits transitioned from 0 to 1 (or 1 to 0). 
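/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the per-field
   mask construction that the chunk `xsetN`/`try_xsetN` code below performs when
   a run of `n` bits starting at `cidx` spans several 64-bit bfields (the `~0`
   special case avoids an undefined 64-bit shift when a field is covered
   completely). Non-atomic and local to this sketch.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stddef.h>

static void set_bit_range(uint64_t* fields, size_t cidx, size_t n) {
  size_t field = cidx / 64;
  size_t idx   = cidx % 64;
  while (n > 0) {
    size_t m = 64 - idx;                   // bits that still fit in this field
    if (m > n) { m = n; }
    const uint64_t mask = (m == 64 ? ~UINT64_C(0) : ((UINT64_C(1) << m) - 1) << idx);
    fields[field] |= mask;                 // the patch does this atomically, per field
    field++; idx = 0; n -= m;
  }
}
/* ---------------------------------------------------------------------------- */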
+static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* pall_already_xset) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; @@ -164,17 +234,28 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); + all_transition = all_transition && mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset ); all_already_xset = all_already_xset && already_xset; // next field field++; idx = 0; n -= m; } - *palready_xset = all_already_xset; + if (pall_already_xset!=NULL) { *pall_already_xset = all_already_xset; } return all_transition; } + +static inline bool mi_bitmap_chunk_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_allready_set) { + return mi_bitmap_chunk_xsetN(MI_BIT_SET, chunk, cidx, n, all_allready_set); +} + +static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_allready_clear) { + return mi_bitmap_chunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, all_allready_clear); +} + + + // Check if a sequence of `n` bits within a chunk are all set/cleared. static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); @@ -197,6 +278,38 @@ static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, siz return all_xset; } + + +static inline bool mi_bitmap_chunk_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx); +} + +static inline bool mi_bitmap_chunk_try_set(mi_bitmap_chunk_t* chunk, size_t cidx) { + return mi_bitmap_chunk_try_xset(MI_BIT_SET, chunk, cidx); +} + +static inline bool mi_bitmap_chunk_try_clear(mi_bitmap_chunk_t* chunk, size_t cidx) { + return mi_bitmap_chunk_try_xset(MI_BIT_CLEAR, chunk, cidx); +} + +static inline bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx) { + mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_xset8(set, &chunk->bfields[i], ibyte_idx); +} + +static inline bool mi_bitmap_chunk_try_set8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { + return mi_bitmap_chunk_try_xset8(MI_BIT_SET, chunk, byte_idx); +} + +static inline bool mi_bitmap_chunk_try_clear8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { + return mi_bitmap_chunk_try_xset8(MI_BIT_CLEAR, chunk, byte_idx); +} + // Try to atomically set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. @@ -252,12 +365,19 @@ restore: while( field > start_field) { field--; const size_t mask = (field == start_field ? mask_start : (field == end_field ? 
mask_end : mask_mid)); - bool already_xset; - mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset); + mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, NULL); } return false; } +static inline bool mi_bitmap_chunk_try_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + return mi_bitmap_chunk_try_xsetN(MI_BIT_SET, chunk, cidx, n); +} + +static inline bool mi_bitmap_chunk_try_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + return mi_bitmap_chunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n); +} + // find least 0/1-bit in a chunk and try to set/clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. @@ -265,8 +385,8 @@ restore: static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while (true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) if (mask==0) return false; @@ -283,6 +403,46 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chu } // try again } +#elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) + while (true) { + size_t chunk_idx = 0; + #if 1 + __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + if ((set ? _mm256_test_all_ones(vec) : _mm256_testz_si256(vec,vec))) { + chunk_idx += 4; + vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1); + } + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 + chunk_idx += _tzcnt_u32(mask) / 8; + #else + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + const __m256i cmpv = (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256()); + const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == ~0 / 0 ? 
0xFF : 0) + const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. + const size_t chunk_idx = _tzcnt_u64(mask) / 8; + #endif + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear + if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } #else for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { size_t idx; @@ -302,49 +462,10 @@ static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, return mi_bitmap_chunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); } -// static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { -// return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); -// } - -/* -// find least 1-bit in a chunk and try unset it atomically -// set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. -// todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - while(true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ? - const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? -1 : 0) - const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) - mi_assert_internal(mask != 0); - const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - size_t cidx; - if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - // try again - } - #else - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically - *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - } - return false; - #endif +static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { + return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); } -*/ + // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. 
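/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the AVX2
   compare/movemask trick used by the chunk search code above to locate a
   64-bit lane that still has a zero bit. Each lane equal to ~0 contributes
   eight set bytes to the movemask; inverting the mask and dividing the
   trailing-zero count by 8 yields the lane index. Requires AVX2 and BMI1
   (e.g. `-mavx2 -mbmi`); names are local to this sketch.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

// Returns 0..3 for the first lane of v[0..3] that is not all-ones, or 4 if none.
static int first_lane_with_zero_bit(const uint64_t v[4]) {
  const __m256i vec  = _mm256_loadu_si256((const __m256i*)v);
  const __m256i eq   = _mm256_cmpeq_epi64(vec, _mm256_set1_epi64x(-1));  // lane == ~0 ? 0xFF.. : 0
  const uint32_t mask = ~(uint32_t)_mm256_movemask_epi8(eq);             // 8 mask bits per lane
  if (mask == 0) return 4;                                               // every lane is full
  return (int)(_tzcnt_u32(mask) / 8);                                    // tzcnt is a multiple of 8
}

int main(void) {
  uint64_t v[4] = { ~UINT64_C(0), ~UINT64_C(0), UINT64_C(0x00FFFFFFFFFFFFFF), ~UINT64_C(0) };
  printf("first lane with room: %d\n", first_lane_with_zero_bit(v));    // prints 2
  return 0;
}
/* ---------------------------------------------------------------------------- */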
@@ -392,7 +513,8 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, } -// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically +// find a sequence of `n` bits in a chunk with all `n` (`< MI_BFIELD_BITS`!) bits set, +// and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. // todo: try avx2 and neon version // todo: allow spanning across bfield boundaries? @@ -410,7 +532,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, if ((b&mask) == mask) { // found a match mi_assert_internal( ((mask << bshift) >> bshift) == mask ); - if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i],mask<bfields); return _mm256_testz_si256( vec, vec ); + #elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + if (!_mm256_testz_si256(vec1, vec1)) return false; + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (_mm256_testz_si256(vec2, vec2)); #else - // written like this for vectorization - mi_bfield_t x = chunk->bfields[0]; - for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { - x = x | chunk->bfields[i]; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + if (chunk->bfields[i] != 0) return false; } - return (x == 0); + return true; #endif } +/* -------------------------------------------------------------------------------- + epochset (for now for 32-bit sets only) +-------------------------------------------------------------------------------- */ + +static void mi_epochset_split(mi_epochset_t es, uint32_t* bset, size_t* epoch) { + *bset = (uint32_t)es; + *epoch = (size_t)(es >> 32); +} + +static mi_epochset_t mi_epochset_join(uint32_t bset, size_t epoch) { + return ((uint64_t)epoch << 32) | bset; +} + +// setting a bit increases the epoch +static void mi_epochset_set(_Atomic(mi_epochset_t)*es, size_t idx) { + mi_assert(idx < 32); + size_t epoch; + uint32_t bset; + mi_epochset_t es_new; + mi_epochset_t es_old = mi_atomic_load_relaxed(es); + do { + mi_epochset_split(es_old, &bset, &epoch); + es_new = mi_epochset_join(bset | (MI_ZU(1)<any_set, chunk_idx); +} + +static bool mi_bitmap_anyset_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx, size_t epoch) { + mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); + return mi_epochset_try_clear(&bitmap->any_set, chunk_idx, epoch); +} + +static uint32_t mi_bitmap_anyset(mi_bitmap_t* bitmap, size_t* epoch) { + uint32_t bset; + mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, epoch); + return bset; +} + +static size_t mi_bitmap_epoch(mi_bitmap_t* bitmap) { + size_t epoch; + uint32_t bset; + mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, &epoch); + return epoch; +} + /* -------------------------------------------------------------------------------- bitmap -------------------------------------------------------------------------------- */ -static void mi_bitmap_update_anyset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - if (set) { - mi_bfield_atomic_xset(MI_BIT_SET, &bitmap->any_set, idx); - } - else { // clear - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR, &bitmap->any_set, idx); - } - } -} // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { @@ -485,8 +664,8 @@ void 
mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { } } -// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. -void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); @@ -495,19 +674,18 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; size_t m = MI_BITMAP_CHUNK_BITS - cidx; if (m > n) { m = n; } - bool already_xset; - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); - mi_bitmap_update_anyset(set, bitmap, chunk_idx); + mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); + mi_bitmap_anyset_set(bitmap, chunk_idx); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); + _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); const size_t end_chunk = chunk_idx + mid_chunks; while (chunk_idx < end_chunk) { - mi_bitmap_update_anyset(set, bitmap, chunk_idx); + mi_bitmap_anyset_set(bitmap, chunk_idx); chunk_idx++; } n -= (mid_chunks * MI_BITMAP_CHUNK_BITS); @@ -517,8 +695,8 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ if (n > 0) { mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); - mi_bitmap_update_anyset(set, bitmap, chunk_idx); + mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); + mi_bitmap_anyset_set(bitmap, chunk_idx); } } @@ -528,12 +706,26 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < MI_BITMAP_MAX_BITS); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - bool ok = mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); - if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } - return ok; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_try_set(&bitmap->chunks[chunk_idx], cidx); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_try_clear(&bitmap->chunks[chunk_idx], cidx); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + + + // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. 
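/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the epoch idea
   behind `any_set` as used above -- a 64-bit word packs a 32-bit membership
   mask (low half) with a 32-bit epoch (high half). Setting a bit bumps the
   epoch; clearing only succeeds when the epoch still equals the value the
   caller observed before deciding the bit could be cleared, so a concurrent
   set can never be lost. C11 atomics; names are local to this sketch and the
   patch's `mi_epochset_*` helpers differ in detail.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>

typedef _Atomic(uint64_t) epochset_t;

static void epochset_set(epochset_t* es, unsigned idx) {        // idx < 32
  uint64_t old = atomic_load_explicit(es, memory_order_relaxed);
  uint64_t desired;
  do {
    const uint32_t mask  = (uint32_t)old;
    const uint32_t epoch = (uint32_t)(old >> 32);
    desired = ((uint64_t)(epoch + 1) << 32) | (mask | (UINT32_C(1) << idx));
  } while (!atomic_compare_exchange_weak_explicit(es, &old, desired,
             memory_order_acq_rel, memory_order_relaxed));
}

static bool epochset_try_clear(epochset_t* es, unsigned idx, uint32_t expected_epoch) {
  uint64_t old = atomic_load_explicit(es, memory_order_relaxed);
  while ((uint32_t)(old >> 32) == expected_epoch) {
    const uint64_t desired = old & ~((uint64_t)(UINT32_C(1) << idx));   // keep the epoch, clear the bit
    if (atomic_compare_exchange_weak_explicit(es, &old, desired,
          memory_order_acq_rel, memory_order_relaxed)) return true;
  }
  return false;   // a set happened in between: leave the bit alone
}
/* ---------------------------------------------------------------------------- */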
bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { @@ -541,11 +733,23 @@ bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx%8 == 0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; - bool ok = mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); - if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } - return ok; + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! @@ -561,22 +765,32 @@ bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - - bool ok = mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); - if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } - return ok; + + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
-bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset ) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - bool local_already_xset; - if (already_xset==NULL) { already_xset = &local_already_xset; } - // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } - // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + + //if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } + //if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; @@ -584,11 +798,23 @@ bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bo mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - const bool allx = mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); - mi_bitmap_update_anyset(set, bitmap, chunk_idx); - return allx; + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); @@ -605,16 +831,18 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) } -#define mi_bitmap_forall_set_chunks(bitmap,tseq,decl_chunk_idx) \ - { size_t _set_idx; \ - size_t _start = tseq % MI_BFIELD_BITS; \ - mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ - while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ - decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; +#define mi_bitmap_forall_set_chunks(bitmap,tseq,name_epoch,name_chunk_idx) \ + { uint32_t _bit_idx; \ + uint32_t _start = (uint32_t)(tseq % MI_EPOCHSET_BITS); \ + size_t name_epoch; \ + uint32_t _any_set = mi_bitmap_anyset(bitmap,&name_epoch); \ + _any_set = mi_rotr32(_any_set, _start); \ + while (mi_bsf32(_any_set,&_bit_idx)) { \ + size_t name_chunk_idx = (_bit_idx + _start) % MI_BFIELD_BITS; #define mi_bitmap_forall_set_chunks_end() \ - _start += _set_idx+1; /* so chunk_idx stays valid */ \ - _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ + _start += _bit_idx+1; /* so chunk_idx stays valid */ \ + _any_set >>= _bit_idx; /* skip scanned bits (and avoid UB with (_bit_idx+1)) */ \ _any_set >>= 1; \ } \ } @@ -623,8 +851,8 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). 
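/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the scan pattern
   behind `mi_bitmap_forall_set_chunks` above. The 32-bit "any set" mask is
   rotated right by a per-thread offset so threads start searching at different
   chunks, then the least significant set bit is taken repeatedly. Uses the
   GCC/Clang `__builtin_ctz`; names are local to this sketch.
---------------------------------------------------------------------------- */
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, uint32_t r) {
  const uint32_t s = r & 31;
  return (x >> s) | (x << ((-s) & 31));          // (-s)&31 avoids UB when s==0
}

static void visit_set_chunks(uint32_t any_set, uint32_t tseq,
                             void (*visit)(uint32_t chunk_idx)) {
  const uint32_t start = tseq % 32;
  uint32_t rot = rotr32(any_set, start);
  uint32_t off = start;
  while (rot != 0) {
    const uint32_t bit = (uint32_t)__builtin_ctz(rot);  // least significant set bit
    visit((bit + off) % 32);                            // undo the rotation
    off += bit + 1;
    rot >>= bit;                                        // two shifts: avoid UB when
    rot >>= 1;                                          // bit+1 == 32
  }
}
/* ---------------------------------------------------------------------------- */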
-bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_set_chunks(bitmap, tseq, epoch, chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { @@ -635,8 +863,8 @@ bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx else { // we may find that all are unset only on a second iteration but that is ok as // _any_set is a conservative approximation. - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } } } @@ -647,8 +875,8 @@ bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,tseq, epoch, chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { @@ -658,8 +886,10 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid return true; } else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. + if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } } } @@ -672,11 +902,8 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries - if (n == 0 || n > MI_BFIELD_BITS) return false; - if (n == 1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); - if (n == 8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); - - mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) + if (n == 0 || n > MI_BFIELD_BITS) return false; + mi_bitmap_forall_set_chunks(bitmap,tseq,epoch,chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { @@ -685,8 +912,10 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t return true; } else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. 
+ if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } } } diff --git a/src/bitmap.h b/src/bitmap.h index 1a180924..38137b0f 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -25,20 +25,26 @@ typedef size_t mi_bfield_t; #define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. #define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. +#define MI_BITMAP_CHUNK_SIZE (MI_BITMAP_CHUNK_BITS / 8) #define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) #define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) -typedef mi_decl_align(32) struct mi_bitmap_chunk_s { +// 512 bits on 64_bit +typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; } mi_bitmap_chunk_t; +// for now 32 (note: with ABA instructions we can make this 64) +#define MI_EPOCHSET_BITS (32) +#define MI_BITMAP_CHUNK_COUNT MI_EPOCHSET_BITS +typedef uint64_t mi_epochset_t; -typedef mi_decl_align(32) struct mi_bitmap_s { - mi_bitmap_chunk_t chunks[MI_BFIELD_BITS]; - _Atomic(mi_bfield_t)any_set; +typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { + mi_bitmap_chunk_t chunks[MI_BITMAP_CHUNK_COUNT]; + _Atomic(mi_epochset_t) any_set; } mi_bitmap_t; -#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit +#define MI_BITMAP_MAX_BITS (MI_BITMAP_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit /* -------------------------------------------------------------------------------- Bitmap @@ -52,29 +58,73 @@ typedef bool mi_bit_t; void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); // Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. -void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! // If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset); +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset); + +static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_set) { + return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, all_already_set); +} + +static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_xsetN(MI_BIT_CLEAR, bitmap, idx, n, NULL); +} + // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); +} + +static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n); +} + + // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) // and false otherwise leaving the bitmask as is. 
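/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: how a global bit
   index is decomposed with the constants above, assuming the 64-bit values
   (MI_BFIELD_BITS = 64, MI_BITMAP_CHUNK_BITS = 512, MI_BITMAP_CHUNK_COUNT = 32,
   hence MI_BITMAP_MAX_BITS = 32 * 512 = 16384; on 32-bit the chunk is 256 bits
   and the total is 8192).
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t bfield_bits = 64, chunk_bits = 512, chunk_count = 32;
  const uint32_t idx = 7000;                        // some bit in the bitmap
  const uint32_t chunk_idx = idx / chunk_bits;      // which 512-bit chunk    -> 13
  const uint32_t cidx      = idx % chunk_bits;      // bit inside the chunk   -> 344
  const uint32_t field     = cidx / bfield_bits;    // which 64-bit bfield    -> 5
  const uint32_t bit       = cidx % bfield_bits;    // bit inside the bfield  -> 24
  printf("bit %u: chunk %u, bfield %u, bit %u (of %u total bits)\n",
         idx, chunk_idx, field, bit, chunk_bits * chunk_count);
  return 0;
}
/* ---------------------------------------------------------------------------- */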
mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); +} + +static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); +} + + // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); +} + +static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); +} + // Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_SET, bitmap, idx, n); +} + +static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); +} + + // Find a set bit in a bitmap and atomically unset it. Returns true on success, // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search @@ -89,4 +139,5 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); + #endif // MI_XBITMAP_H diff --git a/src/free.c b/src/free.c index f0ce8c22..1e9fe478 100644 --- a/src/free.c +++ b/src/free.c @@ -239,9 +239,9 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block } else { if (mi_page_is_abandoned(page)) { - mi_assert(false); + // mi_assert(false); } - mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages + // mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages } } } diff --git a/src/libc.c b/src/libc.c index 05ed7b02..20e9e38b 100644 --- a/src/libc.c +++ b/src/libc.c @@ -280,7 +280,7 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) 
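/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: why the de Bruijn
   multiplication in `_mi_ctz_generic32` in the hunk below works. `x & -x`
   isolates the lowest set bit, a power of two 2^k; multiplying the de Bruijn
   constant by 2^k is a left shift by k, and the top 5 bits of the product are
   different for every k, so a 32-entry table maps them back to k. This check
   verifies that all 32 windows are distinct.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t db = UINT32_C(0x077CB531);
  int seen[32] = {0};
  for (int k = 0; k < 32; k++) {
    const uint32_t window = (uint32_t)(db << k) >> 27;   // top 5 bits after shifting by k
    if (seen[window]) { printf("collision at k=%d\n", k); return 1; }
    seen[window] = 1;
  }
  printf("all 32 windows distinct -> the table lookup recovers k\n");
  return 0;
}
/* ---------------------------------------------------------------------------- */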
{ // generic trailing and leading zero count // -------------------------------------------------------- -static inline size_t mi_ctz_generic32(uint32_t x) { +uint32_t _mi_ctz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, @@ -290,7 +290,7 @@ static inline size_t mi_ctz_generic32(uint32_t x) { return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; } -static inline size_t mi_clz_generic32(uint32_t x) { +static size_t mi_clz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, @@ -319,10 +319,10 @@ size_t _mi_clz_generic(size_t x) { size_t _mi_ctz_generic(size_t x) { if (x==0) return MI_SIZE_BITS; #if (MI_SIZE_BITS <= 32) - return mi_ctz_generic32((uint32_t)x); + return _mi_ctz_generic32((uint32_t)x); #else - const size_t count = mi_ctz_generic32((uint32_t)x); + const size_t count = _mi_ctz_generic32((uint32_t)x); if (count < 32) return count; - return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + return (32 + _mi_ctz_generic32((uint32_t)(x>>32))); #endif } diff --git a/src/options.c b/src/options.c index 8cb0d216..1e64c08e 100644 --- a/src/options.c +++ b/src/options.c @@ -412,7 +412,7 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me // Define our own limited `fprintf` that avoids memory allocation. // We do this using `_mi_vsnprintf` with a limited buffer. static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { - char buf[512]; + char buf[768]; if (fmt==NULL) return; if (!mi_recurse_enter()) return; _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args); From c9abfe82533fc1e863375cbb17a1d642107fda46 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 16:24:40 -0800 Subject: [PATCH 020/264] wip: can run mstress --- include/mimalloc/types.h | 5 +++++ src/arena.c | 3 ++- src/free.c | 19 +++++++++---------- src/page.c | 5 +++-- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 9b772db6..6f2f9c5f 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -321,7 +321,12 @@ typedef struct mi_page_s { #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. #define MI_PAGE_MIN_BLOCK_ALIGN (32) // minimal block alignment in a page #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation + +#if MI_DEBUG && MI_SIZE_SIZE == 8 +#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) +#else #define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) +#endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) diff --git a/src/arena.c b/src/arena.c index 8a5c8f5d..ad919a6e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -538,8 +538,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_page_try_claim_ownership(page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_page_is_owned(page)); return page; } @@ -627,7 +629,6 @@ void _mi_arena_page_free(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(!mi_page_is_singleton(page)); mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); diff --git a/src/free.c b/src/free.c index b6f75c4a..03f93cf3 100644 --- a/src/free.c +++ b/src/free.c @@ -158,13 +158,15 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_thread_id(page)==0); // we own the page now.. + + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); // this must be before collect + // collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count - if (mi_page_is_singleton(page)) mi_assert_internal(mi_page_all_free(page)); + if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } if (mi_page_all_free(page)) { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); // we can free the page directly _mi_arena_page_free(page); return; @@ -186,17 +188,14 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); - // and make it part of our heap + // make it part of our heap _mi_heap_page_reclaim(tagheap, page); return; } } - - // give up ownership as we cannot reclaim this page - // note: we don't need to re-abandon as we did not yet unabandon - _mi_page_unown(page); + + // we cannot reclaim this page.. 
abandon it again + _mi_arena_page_abandon(page); } } diff --git a/src/page.c b/src/page.c index 2d87d80b..fdbcfff1 100644 --- a/src/page.c +++ b/src/page.c @@ -120,7 +120,7 @@ bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(page->keys[0] != 0); #endif if (!mi_page_is_abandoned(page)) { - mi_assert_internal(!_mi_process_is_initialized); + //mi_assert_internal(!_mi_process_is_initialized); { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); @@ -734,7 +734,8 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); - mi_assert_internal(page->xthread_free == 0); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(page->xthread_free == 1); mi_assert_internal(page->next == NULL); mi_assert_internal(page->prev == NULL); mi_assert_internal(page->retire_expire == 0); From 5e95ebc7a015b7ced0f89485d5050bad4d255077 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 17:46:41 -0800 Subject: [PATCH 021/264] fix free stats --- include/mimalloc/internal.h | 1 + src/arena.c | 11 ++++++++--- src/free.c | 21 ++++++++++++++++----- src/options.c | 2 +- src/page.c | 3 ++- test/test-stress.c | 16 ++++++++++++---- 6 files changed, 40 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 8669fa80..afdfe822 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -223,6 +223,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); // void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); +void _mi_stat_free(const mi_page_t* page, const mi_block_t* block); // "libc.c" #include diff --git a/src/arena.c b/src/arena.c index ad919a6e..194854a2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -622,6 +622,10 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_page_all_free(page)); mi_assert_internal(page->next==NULL); + #if MI_STAT > 1 + _mi_page_free_collect(page, true); + #endif + #if MI_DEBUG>1 if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { size_t bin = _mi_bin(mi_page_block_size(page)); @@ -665,7 +669,6 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); // mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); - _mi_page_unown(page); bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&subproc->abandoned_count[bin]); @@ -673,8 +676,9 @@ void _mi_arena_page_abandon(mi_page_t* page) { else { // page is full (or a singleton), page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page - _mi_page_unown(page); - } + } + _mi_page_unown(page); + mi_stat_increase(_mi_stats_main.pages_abandoned, 1); } // called from `mi_free` if trying to unabandon an abandoned page @@ -704,6 +708,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { // nothing to do // TODO: maintain count of these as well? 
} + mi_stat_decrease(_mi_stats_main.pages_abandoned, 1); } /* diff --git a/src/free.c b/src/free.c index 03f93cf3..4ba6d6cc 100644 --- a/src/free.c +++ b/src/free.c @@ -16,7 +16,7 @@ terms of the MIT license. A copy of the license can be found in the file static void mi_check_padding(const mi_page_t* page, const mi_block_t* block); static bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block); static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); +// static void _mi_stat_free(const mi_page_t* page, const mi_block_t* block); // ------------------------------------------------------ @@ -33,7 +33,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool // checks if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); - if (track_stats) { mi_stat_free(page, block); } + if (track_stats) { _mi_stat_free(page, block); } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN && !MI_GUARDED memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif @@ -199,9 +199,20 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { } } -// Push a block that is owned by another thread on its page-local thread free list. +// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { + // adjust stats (after padding check and potentially recursive `mi_free` above) + _mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page, block)); + + // _mi_padding_shrink(page, block, sizeof(mi_block_t)); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + size_t dbgsize = mi_usable_size(block); + if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } + _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); + #endif + // push atomically on the page thread free list mi_thread_free_t tf_new; mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); @@ -532,7 +543,7 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { // only maintain stats for smaller objects if requested #if (MI_STAT>0) -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { #if (MI_STAT < 2) MI_UNUSED(block); #endif @@ -554,7 +565,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { } } #else -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); } #endif diff --git a/src/options.c b/src/options.c index 759d096d..b69058cc 100644 --- a/src/options.c +++ b/src/options.c @@ -158,7 +158,7 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. 
- { 1, UNINIT, MI_OPTION(eager_abandon) }, + { 0, UNINIT, MI_OPTION(eager_abandon) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index fdbcfff1..8cdfd6be 100644 --- a/src/page.c +++ b/src/page.c @@ -189,10 +189,11 @@ static void _mi_page_thread_free_collect(mi_page_t* page) size_t count = 1; mi_block_t* tail = head; mi_block_t* next; - while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) { + while( (next = mi_block_next(page,tail)) != NULL && count <= max_count) { count++; tail = next; } + // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free) if (count > max_count) { _mi_error_message(EFAULT, "corrupted thread-free list\n"); diff --git a/test/test-stress.c b/test/test-stress.c index 9a89744e..61d1424a 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,10 +40,10 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 0 +#elif 1 static int THREADS = 4; static int SCALE = 100; -static int ITER = 20; +static int ITER = 50; #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor @@ -227,7 +227,7 @@ static void test_stress(void) { run_os_threads(THREADS, &stress); #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) // switch between arena and OS allocation for testing - mi_option_set_enabled(mi_option_disallow_arena_alloc, (n%2)==1); + // mi_option_set_enabled(mi_option_disallow_arena_alloc, (n%2)==1); #endif #ifdef HEAP_WALK size_t total = 0; @@ -248,7 +248,14 @@ static void test_stress(void) { { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif } -} + // clean up + for (int i = 0; i < TRANSFERS; i++) { + void* p = atomic_exchange_ptr(&transfer[i], NULL); + if (p != NULL) { + free_items(p); + } + } +} #ifndef STRESS static void leak(intptr_t tid) { @@ -320,6 +327,7 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG + // mi_debug_show_arenas(true, true, false); mi_collect(true); mi_debug_show_arenas(true,true,false); #endif From fe5a3141142d27f1a0a54f95e8cb397b21ae19f3 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 19:31:36 -0800 Subject: [PATCH 022/264] add base and size to OS memid --- include/mimalloc/bits.h | 24 ++++++++++++ include/mimalloc/internal.h | 4 +- include/mimalloc/types.h | 1 + src/arena.c | 14 +++---- src/bitmap.c | 73 +++++++++++++++++++++++++++---------- src/bitmap.h | 6 +-- src/options.c | 2 +- src/os.c | 18 +++++---- test/test-stress.c | 2 +- 9 files changed, 104 insertions(+), 40 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 90d56b4f..f3bbe3bc 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -237,6 +237,30 @@ static inline uint32_t mi_ctz32(uint32_t x) { #define MI_HAS_FAST_BITSCAN 1 #endif + + +static inline size_t mi_popcount(size_t x) { +#if mi_has_builtin_size(popcount) + return mi_builtin_size(popcount)(x); +#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if MI_SIZE_BITS==32 + return __popcnt(x); + #else + return __popcnt64(x); + #endif +#elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_mm_popcnt_u64(x); +#else + #define MI_HAS_FAST_POPCOUNT 0 + error define generic popcount +#endif +} + +#ifndef MI_HAS_FAST_POPCOUNT +#define MI_HAS_FAST_POPCOUNT 1 +#endif + + /* -------------------------------------------------------------------------------- find trailing/leading zero (bit 
scan forward/reverse) -------------------------------------------------------------------------------- */ diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index afdfe822..7d263d47 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -844,8 +844,10 @@ static inline mi_memid_t _mi_memid_none(void) { return _mi_memid_create(MI_MEM_NONE); } -static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) { +static inline mi_memid_t _mi_memid_create_os(void* base, size_t size, bool committed, bool is_zero, bool is_large) { mi_memid_t memid = _mi_memid_create(MI_MEM_OS); + memid.mem.os.base = base; + memid.mem.os.size = size; memid.initially_committed = committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6f2f9c5f..dafd25f1 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -171,6 +171,7 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) { typedef struct mi_memid_os_info { void* base; // actual base address of the block (used for offset aligned allocations) size_t alignment; // alignment at allocation + size_t size; // allocated full size } mi_memid_os_info_t; typedef struct mi_memid_arena_info { diff --git a/src/arena.c b/src/arena.c index 194854a2..08b6c98d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -204,17 +204,19 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // commit requested, but the range may not be committed as a whole: ensure it is committed now memid->initially_committed = true; - bool all_already_committed; - mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &all_already_committed); - if (!all_already_committed) { + size_t already_committed_count = 0; + mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &already_committed_count); + if (already_committed_count < slice_count) { + // recommit the full range bool commit_zero = false; + mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { memid->initially_committed = false; } else { if (commit_zero) { memid->initially_zero = true; } } - } + } } else { // no need to commit, but check if already fully committed @@ -622,10 +624,6 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_page_all_free(page)); mi_assert_internal(page->next==NULL); - #if MI_STAT > 1 - _mi_page_free_collect(page, true); - #endif - #if MI_DEBUG>1 if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { size_t bin = _mi_bin(mi_page_block_size(page)); diff --git a/src/bitmap.c b/src/bitmap.c index ed991441..c7c78dec 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -22,6 +22,11 @@ static inline size_t mi_bfield_ctz(mi_bfield_t x) { return mi_ctz(x); } + +static inline size_t mi_bfield_popcount(mi_bfield_t x) { + return mi_popcount(x); +} + //static inline size_t mi_bfield_clz(mi_bfield_t x) { // return mi_clz(x); //} @@ -70,26 +75,57 @@ static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, s } } +// Set a pair of bits atomically, and return true of the mask bits transitioned from all 0's to 1's. 
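// ----------------------------------------------------------------------------
// Aside: the mi_bfield_atomic_set2/clear2/set_mask helpers below all use the
// same compare-and-swap retry pattern: load the bitfield, OR/AND in the mask,
// and retry the CAS until it succeeds, returning whether every masked bit
// really transitioned. A minimal standalone C11 sketch of that pattern; the
// `sketch_` name and the plain `_Atomic uint64_t` field are illustrative only
// and not part of the mimalloc API.
// ----------------------------------------------------------------------------
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool sketch_atomic_set_mask(_Atomic uint64_t* b, uint64_t mask) {
  uint64_t old = atomic_load_explicit(b, memory_order_relaxed);
  while (!atomic_compare_exchange_weak_explicit(b, &old, old | mask,
                                                memory_order_acq_rel, memory_order_acquire)) {
    // `old` was refreshed by the failed CAS; retry with the new value
  }
  return ((old & mask) == 0);  // true iff none of the mask bits were set before
}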
+static inline bool mi_bfield_atomic_set2(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_already_set) { + mi_assert_internal(idx < MI_BFIELD_BITS-1); + const size_t mask = (mi_bfield_t)0x03 << idx; + mi_bfield_t old = mi_atomic_load_relaxed(b); + while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits until success + if (all_already_set!=NULL) { *all_already_set = ((old&mask)==mask); } + return ((old&mask) == 0); +} + +// Clear a pair of bits atomically, and return true of the mask bits transitioned from all 1's to 0's +static inline bool mi_bfield_atomic_clear2(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_already_clear) { + mi_assert_internal(idx < MI_BFIELD_BITS-1); + const size_t mask = (mi_bfield_t)0x03 << idx; + mi_bfield_t old = mi_atomic_load_relaxed(b); + while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits until success + if (all_already_clear!=NULL) { *all_already_clear = ((old&mask) == 0); } + return ((old&mask) == mask); +} + +// Set/clear a pair of bits atomically, and return true of the mask bits transitioned from all 0's to 1's (or all 1's to 0's) +static inline bool mi_bfield_atomic_xset2(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx, bool* already_xset) { + if (set) { + return mi_bfield_atomic_set2(b, idx, already_xset); + } + else { + return mi_bfield_atomic_clear2(b, idx, already_xset); + } +} + + // Set a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's. -static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* already_set) { +static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits until success - if (already_set!=NULL) { *already_set = ((old&mask)==mask); } + if (already_set!=NULL) { *already_set = mi_bfield_popcount(old&mask); } return ((old&mask) == 0); } // Clear a mask set of bits atomically, and return true of the mask bits transitioned from all 1's to 0's -static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* already_clear) { +static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_clear) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits until success - if (already_clear!=NULL) { *already_clear = ((old&mask)==0); } + if (already_clear!=NULL) { *already_clear = mi_bfield_popcount(~(old&mask)); } return ((old&mask) == mask); } // Set/clear a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's (or all 1's to 0's) -static inline bool mi_bfield_atomic_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* already_xset) { +static inline bool mi_bfield_atomic_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_xset) { mi_assert_internal(mask != 0); if (set) { return mi_bfield_atomic_set_mask(b, mask, already_xset); @@ -225,9 +261,8 @@ static inline bool mi_bitmap_chunk_xset2(mi_bit_t set, mi_bitmap_chunk_t* chunk, const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; mi_assert_internal(idx < MI_BFIELD_BITS-1); - mi_assert_internal((idx%2)==0); - const 
size_t mask = (mi_bfield_t)0x03 << idx; - return mi_bfield_atomic_xset_mask(set, &chunk->bfields[i], mask, all_already_xset); + mi_assert_internal((idx%2)==0); + return mi_bfield_atomic_xset2(set, &chunk->bfields[i], idx, all_already_xset); } static inline bool mi_bitmap_chunk_set2(mi_bitmap_chunk_t* chunk, size_t cidx, bool* all_already_set) { @@ -241,11 +276,11 @@ static inline bool mi_bitmap_chunk_clear2(mi_bitmap_chunk_t* chunk, size_t cidx, // Set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). -static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* pall_already_xset) { +static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* pall_already_xset) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; - bool all_already_xset = true; + size_t all_already_xset = 0; size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; while (n > 0) { @@ -254,9 +289,9 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset ); - all_already_xset = all_already_xset && already_xset; + all_already_xset += already_xset; // next field field++; idx = 0; @@ -267,12 +302,12 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t } -static inline bool mi_bitmap_chunk_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_allready_set) { - return mi_bitmap_chunk_xsetN(MI_BIT_SET, chunk, cidx, n, all_allready_set); +static inline bool mi_bitmap_chunk_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { + return mi_bitmap_chunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); } -static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_allready_clear) { - return mi_bitmap_chunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, all_allready_clear); +static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* already_clear) { + return mi_bitmap_chunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, already_clear); } @@ -829,7 +864,7 @@ bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
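// ----------------------------------------------------------------------------
// Aside: with this change the xsetN helpers report how many of the requested
// bits were already set, computed with a population count over `old & mask`.
// bits.h above still leaves the generic (non-intrinsic) popcount path
// unimplemented; a portable SWAR fallback could look like the sketch below
// (the `sketch_popcount64` name is illustrative, not the mimalloc API).
// ----------------------------------------------------------------------------
#include <stdint.h>

static inline uint64_t sketch_popcount64(uint64_t x) {
  x = x - ((x >> 1) & 0x5555555555555555ULL);                           // 2-bit partial sums
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); // 4-bit partial sums
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                           // 8-bit partial sums
  return (x * 0x0101010101010101ULL) >> 56;                             // sum the byte counts
}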
-bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset ) { +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); @@ -846,11 +881,11 @@ bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bo // first set the anyset since it is a conservative approximation (increases epoch) mi_bitmap_anyset_set(bitmap, chunk_idx); // then actually try to set it atomically - return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); } else { const size_t epoch = mi_bitmap_epoch(bitmap); - bool cleared = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + bool cleared = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } diff --git a/src/bitmap.h b/src/bitmap.h index 62aab7a7..8c961fe1 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -65,10 +65,10 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! // If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset); +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); -static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_set) { - return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, all_already_set); +static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { + return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, already_set); } static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { diff --git a/src/options.c b/src/options.c index b69058cc..759d096d 100644 --- a/src/options.c +++ b/src/options.c @@ -158,7 +158,7 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. 
- { 0, UNINIT, MI_OPTION(eager_abandon) }, + { 1, UNINIT, MI_OPTION(eager_abandon) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/os.c b/src/os.c index 0aa0a681..bac59437 100644 --- a/src/os.c +++ b/src/os.c @@ -128,21 +128,24 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_st if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } - if (still_committed) { _mi_stat_decrease(&stats->committed, size); } + if (still_committed) { + _mi_stat_decrease(&stats->committed, size); + } _mi_stat_decrease(&stats->reserved, size); } void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats) { if (stats == NULL) stats = &_mi_stats_main; if (mi_memkind_is_os(memid.memkind)) { - size_t csize = _mi_os_good_alloc_size(size); + size_t csize = memid.mem.os.size; + if (csize==0) { _mi_os_good_alloc_size(size); } void* base = addr; // different base? (due to alignment) - if (memid.mem.os.base != NULL) { + if (memid.mem.os.base != base) { mi_assert(memid.mem.os.base <= addr); mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); base = memid.mem.os.base; - csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); + if (memid.mem.os.size==0) { csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); } } // free it if (memid.memkind == MI_MEM_OS_HUGE) { @@ -296,7 +299,7 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { bool os_is_zero = false; void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); if (p != NULL) { - *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); + *memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large); } return p; } @@ -315,9 +318,10 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo void* os_base = NULL; void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, stats ); if (p != NULL) { - *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); + *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; memid->mem.os.alignment = alignment; + memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned } return p; } @@ -642,7 +646,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; } if (page != 0) { mi_assert(start != NULL); - *memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */); + *memid = _mi_memid_create_os(start, *psize, true /* is committed */, all_zero, true /* is_large */); memid->memkind = MI_MEM_OS_HUGE; mi_assert(memid->is_pinned); #ifdef MI_TRACK_ASAN diff --git a/test/test-stress.c b/test/test-stress.c index 61d1424a..487f7215 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,7 +40,7 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 1 +#elif 0 static int THREADS = 4; static int SCALE = 100; static int ITER = 50; From bd5f7de3f416bb8a90d97d0ef1ae6b69ecebbe37 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 20:21:35 -0800 Subject: [PATCH 023/264] can run basic test --- src/arena.c | 4 ++-- src/bitmap.c | 30 ++++++++++++++++++++++-------- src/init.c | 2 +- src/page-queue.c | 30 +++++++++++++++++++++++++++++- src/page.c | 9 
++++++--- test/test-stress.c | 4 ++-- 6 files changed, 62 insertions(+), 17 deletions(-) diff --git a/src/arena.c b/src/arena.c index 08b6c98d..317a7e48 100644 --- a/src/arena.c +++ b/src/arena.c @@ -676,7 +676,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { // leave as is; it will be reclaimed when an object is free'd in the page } _mi_page_unown(page); - mi_stat_increase(_mi_stats_main.pages_abandoned, 1); + _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1); } // called from `mi_free` if trying to unabandon an abandoned page @@ -706,7 +706,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { // nothing to do // TODO: maintain count of these as well? } - mi_stat_decrease(_mi_stats_main.pages_abandoned, 1); + _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); } /* diff --git a/src/bitmap.c b/src/bitmap.c index c7c78dec..eb5da086 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -453,6 +453,20 @@ static inline bool mi_bitmap_chunk_try_clearN(mi_bitmap_chunk_t* chunk, size_t c return mi_bitmap_chunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n); } +#if defined(__AVX2__) +static inline __m256i mi_mm256_zero(void) { + return _mm256_setzero_si256(); +} +static inline __m256i mi_mm256_ones(void) { + return _mm256_set1_epi64x(~0); +} +static inline bool mi_mm256_is_ones(__m256i vec) { + return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); +} +static inline bool mi_mm256_is_zero( __m256i vec) { + return _mm256_testz_si256(vec,vec); +} +#endif // find least 0/1-bit in a chunk and try to set/clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. @@ -461,7 +475,7 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chu #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) if (mask==0) return false; @@ -483,11 +497,11 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chu size_t chunk_idx = 0; #if 1 __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - if ((set ? _mm256_test_all_ones(vec) : _mm256_testz_si256(vec,vec))) { + if ((set ? mi_mm256_is_ones(vec) : mi_mm256_is_zero(vec))) { chunk_idx += 4; vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1); } - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 
0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) if (mask==0) return false; @@ -496,7 +510,7 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chu #else const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - const __m256i cmpv = (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256()); + const __m256i cmpv = (set ? mi_mm256_ones() : mi_mm256_zero()); const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) @@ -549,7 +563,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while(true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? -1 : 0) + const __m256i vcmp = _mm256_cmpeq_epi8(vec, mi_mm256_ones()); // (byte == ~0 ? -1 : 0) const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte if (mask == 0) return false; const size_t i = _tzcnt_u32(mask); @@ -650,12 +664,12 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, static inline bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return _mm256_testz_si256( vec, vec ); + return mi_mm256_is_zero(vec); #elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); - if (!_mm256_testz_si256(vec1, vec1)) return false; + if (!mi_mm256_is_zero(vec1)) return false; const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - return (_mm256_testz_si256(vec2, vec2)); + return (mi_mm256_is_zero(vec2)); #else for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { if (chunk->bfields[i] != 0) return false; diff --git a/src/init.c b/src/init.c index 05ce54b4..d1670d02 100644 --- a/src/init.c +++ b/src/init.c @@ -34,7 +34,7 @@ const mi_page_t _mi_page_empty = { NULL, // xheap NULL, NULL, // next, prev NULL, // subproc - { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid + { {{ NULL, 0, 0}}, false, false, false, MI_MEM_NONE } // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) diff --git a/src/page-queue.c b/src/page-queue.c index 552e12c3..ad616b1d 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -260,6 +260,34 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ heap->page_count++; } +static void mi_page_queue_push_at_end(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { + mi_assert_internal(mi_page_heap(page) == heap); + mi_assert_internal(!mi_page_queue_contains(queue, page)); + + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + + mi_page_set_in_full(page, 
mi_page_queue_is_full(queue)); + + page->prev = queue->last; + page->next = NULL; + if (queue->last != NULL) { + mi_assert_internal(queue->last->next == NULL); + queue->last->next = page; + queue->last = page; + } + else { + queue->first = queue->last = page; + } + + // update direct + if (queue->first == page) { + mi_heap_queue_first_update(heap, queue); + } + heap->page_count++; +} + static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_queue_contains(queue, page)); @@ -344,7 +372,7 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro static void mi_page_queue_enqueue_from_full(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { // note: we could insert at the front to increase reuse, but it slows down certain benchmarks (like `alloc-test`) - mi_page_queue_enqueue_from_ex(to, from, false /* enqueue at the end of the `to` queue? */, page); + mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end of the `to` queue? */, page); } // Only called from `mi_heap_absorb`. diff --git a/src/page.c b/src/page.c index 8cdfd6be..4d26dbad 100644 --- a/src/page.c +++ b/src/page.c @@ -274,7 +274,7 @@ void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page) mi_page_set_heap(page,heap); _mi_page_free_collect(page, false); // ensure used count is up to date mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); - mi_page_queue_push(heap, pq, page); + mi_page_queue_push_at_end(heap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); } @@ -807,8 +807,11 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p page_candidate = page; candidate_count = 0; } - else if (/* !mi_page_is_expandable(page) && */ page->used >= page_candidate->used) { - if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); } + else if (mi_page_all_free(page_candidate)) { + _mi_page_free(page_candidate, pq); + page_candidate = page; + } + else if (page->used >= page_candidate->used) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate diff --git a/test/test-stress.c b/test/test-stress.c index 487f7215..ffeb5dea 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -46,7 +46,7 @@ static int SCALE = 100; static int ITER = 50; #else static int THREADS = 32; // more repeatable if THREADS <= #processors -static int SCALE = 25; // scaling factor +static int SCALE = 50; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads #endif @@ -54,7 +54,7 @@ static int ITER = 50; // N full iterations destructing and re-creating a #define STRESS // undefine for leak test -static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) +static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? 
static bool main_participates = false; // main thread participates as a worker too From 833b091ff9a54f42e110093031e1bc9fa204cc52 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 20:25:44 -0800 Subject: [PATCH 024/264] can run the full test suite --- include/mimalloc/internal.h | 5 ++--- src/free.c | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 7d263d47..cee88684 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -220,10 +220,9 @@ void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_att void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); -bool _mi_free_delayed_block(mi_block_t* block); -// void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); -void _mi_stat_free(const mi_page_t* page, const mi_block_t* block); +// bool _mi_free_delayed_block(mi_block_t* block); + // "libc.c" #include diff --git a/src/free.c b/src/free.c index 4ba6d6cc..4bce6886 100644 --- a/src/free.c +++ b/src/free.c @@ -16,7 +16,7 @@ terms of the MIT license. A copy of the license can be found in the file static void mi_check_padding(const mi_page_t* page, const mi_block_t* block); static bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block); static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); -// static void _mi_stat_free(const mi_page_t* page, const mi_block_t* block); +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); // ------------------------------------------------------ @@ -33,7 +33,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool // checks if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); - if (track_stats) { _mi_stat_free(page, block); } + if (track_stats) { mi_stat_free(page, block); } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN && !MI_GUARDED memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif @@ -203,7 +203,7 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { // adjust stats (after padding check and potentially recursive `mi_free` above) - _mi_stat_free(page, block); // stat_free may access the padding + mi_stat_free(page, block); // stat_free may access the padding mi_track_free_size(block, mi_page_usable_size_of(page, block)); // _mi_padding_shrink(page, block, sizeof(mi_block_t)); @@ -543,7 +543,7 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { // only maintain stats for smaller objects if requested #if (MI_STAT>0) -void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { #if (MI_STAT < 2) MI_UNUSED(block); #endif @@ -565,7 +565,7 @@ void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { } } #else -void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +void mi_stat_free(const mi_page_t* page, const 
mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); } #endif From 666c089fc85b67c0773e502856f5b9fb179164cd Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 10:51:13 -0800 Subject: [PATCH 025/264] revise free reclaim; ensure unown cannot race with a free --- include/mimalloc/internal.h | 88 ++++++++++++++------ include/mimalloc/types.h | 4 + src/arena.c | 71 +++++++++++----- src/bitmap.c | 16 ++-- src/free.c | 156 +++++++++++++++++++++++++++++++----- src/init.c | 2 +- src/options.c | 2 +- src/page.c | 2 +- src/stats.c | 4 + test/test-stress.c | 15 +++- 10 files changed, 281 insertions(+), 79 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index cee88684..56172bcd 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -143,7 +143,8 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); void _mi_arena_page_free(mi_page_t* page); void _mi_arena_page_abandon(mi_page_t* page); -void _mi_arena_page_unabandon(mi_page_t* page); +void _mi_arena_page_unabandon(mi_page_t* page); +bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page); void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap); @@ -572,29 +573,6 @@ static inline bool mi_page_is_owned(const mi_page_t* page) { return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); } -// Unown a page that is currently owned -static inline void _mi_page_unown(mi_page_t* page) { - mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(mi_page_thread_id(page)==0); - const uintptr_t old = mi_atomic_and_acq_rel(&page->xthread_free, ~((uintptr_t)1)); - mi_assert_internal((old&1)==1); MI_UNUSED(old); - /* - mi_thread_free_t tf_new; - mi_thread_free_t tf_old; - do { - tf_old = mi_atomic_load_relaxed(&page->xthread_free); - mi_assert_internal(mi_tf_is_owned(tf_old)); - tf_new = mi_tf_create(mi_tf_block(tf_old), false); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); - */ -} - -// get ownership if it is not yet owned -static inline bool mi_page_try_claim_ownership(mi_page_t* page) { - const uintptr_t old = mi_atomic_or_acq_rel(&page->xthread_free, 1); - return ((old&1)==0); -} - //static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { // return mi_tf_make(mi_tf_block(tf),delayed); @@ -638,7 +616,7 @@ static inline bool mi_page_is_full(mi_page_t* page) { } // is more than 7/8th of a page in use? 
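// ----------------------------------------------------------------------------
// Aside: the renamed mi_page_is_mostly_used below keeps the same 7/8th
// heuristic: a page counts as "mostly used" once fewer than reserved/8 blocks
// remain free. A standalone sketch with a worked example (illustrative name):
// ----------------------------------------------------------------------------
#include <stdbool.h>
#include <stdint.h>

static bool sketch_is_mostly_used(uint16_t reserved, uint16_t used) {
  uint16_t frac = reserved / 8;       // e.g. reserved = 128  ->  frac = 16
  return (reserved - used <= frac);   // e.g. used >= 112     ->  mostly used
}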
-static inline bool mi_page_mostly_used(const mi_page_t* page) { +static inline bool mi_page_is_mostly_used(const mi_page_t* page) { if (page==NULL) return true; uint16_t frac = page->reserved / 8U; return (page->reserved - page->used <= frac); @@ -646,9 +624,22 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_atomic_load_acquire(&page->xthread_id) == 0); + return (mi_atomic_load_acquire(&page->xthread_id) <= 1); } +static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { + return (mi_atomic_load_acquire(&page->xthread_id) == 1); +} + +static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { + mi_atomic_or_acq_rel(&page->xthread_id, (uintptr_t)1); +} + +static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { + mi_atomic_and_acq_rel(&page->xthread_id, ~(uintptr_t)1); +} + + static inline bool mi_page_is_huge(const mi_page_t* page) { return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN)); } @@ -659,6 +650,51 @@ static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) } +// Unown a page that is currently owned +static inline void _mi_page_unown_unconditional(mi_page_t* page) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_thread_id(page)==0); + const uintptr_t old = mi_atomic_and_acq_rel(&page->xthread_free, ~((uintptr_t)1)); + mi_assert_internal((old&1)==1); MI_UNUSED(old); + /* + mi_thread_free_t tf_new; + mi_thread_free_t tf_old; + do { + tf_old = mi_atomic_load_relaxed(&page->xthread_free); + mi_assert_internal(mi_tf_is_owned(tf_old)); + tf_new = mi_tf_create(mi_tf_block(tf_old), false); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); + */ +} + + +// get ownership if it is not yet owned +static inline bool mi_page_try_claim_ownership(mi_page_t* page) { + const uintptr_t old = mi_atomic_or_acq_rel(&page->xthread_free, 1); + return ((old&1)==0); +} + +static inline void _mi_page_unown(mi_page_t* page) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_page_thread_id(page)==0); + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_assert_internal(mi_tf_is_owned(tf_old)); + while mi_unlikely(mi_tf_block(tf_old) != NULL) { + _mi_page_free_collect(page, false); // update used + if (mi_page_all_free(page)) { // it may become free just before unowning it + _mi_arena_page_unabandon(page); + _mi_arena_page_free(page); + return; + } + tf_old = mi_atomic_load_relaxed(&page->xthread_free); + } + mi_assert_internal(mi_tf_block(tf_old)==NULL); + tf_new = mi_tf_create(NULL, false); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); +} //----------------------------------------------------------- // Page flags diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index dafd25f1..4430cd6c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -505,6 +505,10 @@ typedef struct mi_stats_s { mi_stat_count_t giant; mi_stat_count_t malloc; mi_stat_counter_t pages_extended; + mi_stat_counter_t pages_reclaim_on_alloc; + mi_stat_counter_t pages_reclaim_on_free; + mi_stat_counter_t pages_reabandon_full; + mi_stat_counter_t 
pages_unabandon_busy_wait; mi_stat_counter_t mmap_calls; mi_stat_counter_t commit_calls; mi_stat_counter_t reset_calls; diff --git a/src/arena.c b/src/arena.c index 317a7e48..a2343674 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,7 +42,7 @@ typedef struct mi_arena_s { bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. - + mi_bitmap_t slices_free; // is the slice free? mi_bitmap_t slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t slices_purge; // can the slice be purged? (slice in purge => slice in free) @@ -216,7 +216,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( else { if (commit_zero) { memid->initially_zero = true; } } - } + } } else { // no need to commit, but check if already fully committed @@ -355,7 +355,7 @@ static mi_decl_noinline void* mi_arena_try_alloc( { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); - + // try to find free slices in the arena's void* p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; @@ -457,7 +457,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // try to claim ownership atomically mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); if (!mi_page_try_claim_ownership(page)) { - // a concurrent free already grabbed the page. + // a concurrent free already grabbed the page. // Restore the abandoned_map to make it available again (unblocking busy waiters) mi_pairmap_set(pairmap, slice_index); } @@ -465,6 +465,9 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // we got ownership, clear the abandoned entry (unblocking busy waiters) mi_pairmap_clear(pairmap, slice_index); mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); + _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); + _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); @@ -472,7 +475,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); - mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_page_is_owned(page)); @@ -492,11 +495,11 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz const bool commit = true; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t page_alignment = MI_ARENA_SLICE_ALIGN; - + // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; - if (!_mi_option_get_fast(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? 
+ if (!_mi_option_get_fast(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { @@ -575,16 +578,16 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); const size_t slice_count = mi_slice_count_of_size(info_size + block_size); - + mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld); if (page == NULL) return NULL; - + mi_assert(page != NULL); - mi_assert(page->reserved == 1); + mi_assert(page->reserved == 1); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); - return page; + return page; } @@ -646,17 +649,17 @@ void _mi_arena_page_free(mi_page_t* page) { Arena abandon ----------------------------------------------------------- */ -void _mi_arena_page_abandon(mi_page_t* page) { +static void mi_arena_page_abandon_no_stat(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(!mi_page_all_free(page)); mi_assert_internal(page->next==NULL); - + mi_subproc_t* subproc = page->subproc; if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { - // make available for allocations + // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); size_t slice_index; size_t slice_count; @@ -667,6 +670,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); // mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + mi_page_set_abandoned_mapped(page); bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&subproc->abandoned_count[bin]); @@ -676,34 +680,59 @@ void _mi_arena_page_abandon(mi_page_t* page) { // leave as is; it will be reclaimed when an object is free'd in the page } _mi_page_unown(page); +} + +void _mi_arena_page_abandon(mi_page_t* page) { + mi_arena_page_abandon_no_stat(page); _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1); } +bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(!mi_page_is_abandoned_mapped(page)); + mi_assert_internal(!mi_page_is_full(page)); + mi_assert_internal(!mi_page_all_free(page)); + mi_assert_internal(!mi_page_is_singleton(page)); + if (mi_page_is_full(page) || mi_page_is_abandoned_mapped(page) || page->memid.memkind != MI_MEM_ARENA) { + return false; + } + else { + _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1); + mi_arena_page_abandon_no_stat(page); + return true; + } +} + // called from `mi_free` if trying to unabandon an abandoned page void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); - - if 
(page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { + + if (mi_page_is_abandoned_mapped(page)) { + mi_assert_internal(page->memid.memkind==MI_MEM_ARENA); // remove from the abandoned map size_t bin = _mi_bin(mi_page_block_size(page)); size_t slice_index; size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_pairmap_clear_while_not_busy(&arena->pages_abandoned[bin], slice_index); + mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } else { // page is full (or a singleton), page is OS/externally allocated - // nothing to do + // nothing to do // TODO: maintain count of these as well? } _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); @@ -715,7 +744,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned (anymore) - + // note: we can access the page even it is in the meantime reclaimed by another thread since // we only call this when on free (and thus there is still an object alive in the page) mi_memid_t memid = page->memid; @@ -967,7 +996,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; mi_lock_init(&arena->abandoned_visit_lock); - + // init bitmaps mi_bitmap_init(&arena->slices_free,true); mi_bitmap_init(&arena->slices_committed,true); @@ -1068,7 +1097,7 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ _mi_memset(buf + k, 'o', MI_BFIELD_BITS); k += MI_BFIELD_BITS; } - bit_count += MI_BFIELD_BITS; + bit_count += MI_BFIELD_BITS; } _mi_output_message("%s %s\n", prefix, buf); } diff --git a/src/bitmap.c b/src/bitmap.c index eb5da086..df25e028 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -80,7 +80,7 @@ static inline bool mi_bfield_atomic_set2(_Atomic(mi_bfield_t)*b, size_t idx, boo mi_assert_internal(idx < MI_BFIELD_BITS-1); const size_t mask = (mi_bfield_t)0x03 << idx; mi_bfield_t old = mi_atomic_load_relaxed(b); - while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits until success + while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)) { }; // try to atomically set the mask bits until success if (all_already_set!=NULL) { *all_already_set = ((old&mask)==mask); } return ((old&mask) == 0); } @@ -90,7 +90,7 @@ static inline bool mi_bfield_atomic_clear2(_Atomic(mi_bfield_t)*b, size_t idx, b mi_assert_internal(idx < MI_BFIELD_BITS-1); const size_t mask = (mi_bfield_t)0x03 << idx; mi_bfield_t old = mi_atomic_load_relaxed(b); - while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits until success + while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)) { }; // try to atomically clear the mask bits until success if (all_already_clear!=NULL) { *all_already_clear = ((old&mask) == 0); } return ((old&mask) == mask); } @@ -110,7 +110,7 @@ static inline bool mi_bfield_atomic_xset2(mi_bit_t set, _Atomic(mi_bfield_t)*b, static inline bool 
mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); - while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits until success + while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)) { }; // try to atomically set the mask bits until success if (already_set!=NULL) { *already_set = mi_bfield_popcount(old&mask); } return ((old&mask) == 0); } @@ -119,7 +119,7 @@ static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_clear) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); - while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits until success + while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)) { }; // try to atomically clear the mask bits until success if (already_clear!=NULL) { *already_clear = mi_bfield_popcount(~(old&mask)); } return ((old&mask) == mask); } @@ -1115,16 +1115,18 @@ static inline bool mi_bfield_atomic_clear_while_not_busy(_Atomic(mi_bfield_t)*b, mi_assert_internal(idx < MI_BFIELD_BITS-1); const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); - mi_bfield_t old; mi_bfield_t bnew; + mi_bfield_t old = mi_atomic_load_relaxed(b); do { - old = mi_atomic_load_relaxed(b); if mi_unlikely((old&mask)==mask_busy) { old = mi_atomic_load_acquire(b); + if ((old&mask)==mask_busy) { + _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); + } while ((old&mask)==mask_busy) { // busy wait mi_atomic_yield(); old = mi_atomic_load_acquire(b); - } + } } bnew = (old & ~mask); // clear } while (!mi_atomic_cas_weak_acq_rel(b, &old, bnew)); diff --git a/src/free.c b/src/free.c index 4bce6886..6e8514c6 100644 --- a/src/free.c +++ b/src/free.c @@ -128,7 +128,7 @@ void mi_free(void* p) mi_attr_noexcept { mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); if mi_unlikely(page==NULL) return; - + const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); if mi_likely(is_local) { // thread-local free? if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) @@ -156,50 +156,164 @@ void mi_free(void* p) mi_attr_noexcept static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_thread_id(page)==0); - +#if 1 // we own the page now.. - - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); // this must be before collect - - // collect the thread atomic free list + // safe to collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count + #if MI_DEBUG > 1 if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } + #endif - if (mi_page_all_free(page)) { + // 1. free if the page is free now + if (mi_page_all_free(page)) + { + // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) + _mi_arena_page_unabandon(page); // we can free the page directly _mi_arena_page_free(page); return; } - else { - // the page has still some blocks in use + // 2. 
if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations + else if (!mi_page_is_mostly_used(page) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && + _mi_arena_page_try_reabandon_to_mapped(page)) + { + return; + } + // 3. if the page is not too full, we can try to reclaim it for ourselves + else if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 && + !mi_page_is_mostly_used(page)) + { + // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again // todo: optimize this check further? // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) mi_heap_t* const heap = mi_prim_get_default_heap(); - - if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && // only if reclaim on free is allowed - (heap != (mi_heap_t*)&_mi_heap_empty)) // we did not already terminate our thread (can this happen? + if (heap != (mi_heap_t*)&_mi_heap_empty) // we did not already terminate our thread (can this happen? { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types + if ((tagheap != NULL) && // don't reclaim across heap object types (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) { - // make it part of our heap + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); _mi_heap_page_reclaim(tagheap, page); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); return; - } + } + } + } + + // not reclaimed or free'd, unown again + _mi_page_unown(page); + +#else + if (!mi_page_is_abandoned_mapped(page)) { + // singleton or OS allocated + if (mi_page_is_singleton(page)) { + // free singleton pages + #if MI_DEBUG>1 + _mi_page_free_collect(page, false); // update `used` count + mi_assert_internal(mi_page_all_free(page)); + #endif + // we can free the page directly + _mi_arena_page_free(page); + return; + } + else { + const bool was_full = mi_page_is_full(page); + _mi_page_free_collect(page,false); // update used + if (mi_page_all_free(page)) { + // no need to unabandon as it is unmapped + _mi_arena_page_free(page); + return; + } + else if (was_full && _mi_arena_page_reabandon_full(page)) { + return; + } + else if (!mi_page_is_mostly_used(page) && _mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) { + // the page has still some blocks in use (but not too many) + // reclaim in our heap if compatible, or otherwise abandon again + // todo: optimize this check further? + // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should + // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) + mi_heap_t* const heap = mi_prim_get_default_heap(); + if (heap != (mi_heap_t*)&_mi_heap_empty) { // we did not already terminate our thread (can this happen? + mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); + if ((tagheap != NULL) && // don't reclaim across heap object types + (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) + (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + ) + { + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); + // make it part of our heap (no need to unabandon as is unmapped) + _mi_heap_page_reclaim(tagheap, page); + return; + } + } + } + } + } + else { + // don't reclaim pages that can be found for fresh page allocations + } + + // not reclaimed or free'd, unown again + _mi_page_unown(page); +#endif +} + +/* +// we own the page now.. +// safe to collect the thread atomic free list +_mi_page_free_collect(page, false); // update `used` count +if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } + +if (mi_page_all_free(page)) { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); // this must be before free'ing + // we can free the page directly + _mi_arena_page_free(page); + return; +} +else if (!mi_page_is_mostly_used(page)) { + // the page has still some blocks in use (but not too many) + // reclaim in our heap if compatible, or otherwise abandon again + // todo: optimize this check further? + // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should + // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) + mi_heap_t* const heap = mi_prim_get_default_heap(); + + if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && // only if reclaim on free is allowed + (heap != (mi_heap_t*)&_mi_heap_empty)) // we did not already terminate our thread (can this happen? + { + mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); + if ((tagheap != NULL) && // don't reclaim across heap object types + (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) + (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + ) + { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); + // make it part of our heap + _mi_heap_page_reclaim(tagheap, page); + return; } - - // we cannot reclaim this page.. abandon it again - _mi_arena_page_abandon(page); } } -// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. +// we cannot reclaim this page.. leave it abandoned +// todo: should re-abandon or otherwise a partly used page could never be re-used if the +// objects in it are not freed explicitly. +_mi_page_unown(page); +*/ + + +// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. 
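The free path above repeatedly talks about owning a page and, when the page is neither freed nor reclaimed, handing ownership back with `_mi_page_unown` while other threads may still push blocks onto the page-local thread free list. Below is a minimal sketch of that general pattern only, assuming a simplified layout where bit 0 of a single atomic word is the ownership flag and the remaining bits hold a list head; the names (`tf_word`, `tf_try_claim`, `tf_release_ownership`) are illustrative and not taken from the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// Hypothetical tagged word: bit 0 = "owned" flag, upper bits = thread-free list head.
typedef _Atomic(uintptr_t) tf_word;

static inline bool tf_try_claim(tf_word* w) {
  // claim ownership by setting bit 0; we got it iff the bit was previously clear
  const uintptr_t old = atomic_fetch_or_explicit(w, (uintptr_t)1, memory_order_acq_rel);
  return ((old & 1) == 0);
}

static inline void tf_release_ownership(tf_word* w) {
  // clear only the ownership bit, preserving any list head pushed concurrently
  uintptr_t old = atomic_load_explicit(w, memory_order_relaxed);
  uintptr_t desired;
  do {
    desired = old & ~(uintptr_t)1;
  } while (!atomic_compare_exchange_weak_explicit(w, &old, desired,
                                                  memory_order_acq_rel, memory_order_acquire));
}

A fetch-and would suffice for this simplified release; the CAS form is shown because it leaves room to act on the previous value inside the loop, which is how the patch's `_mi_page_unown` loop is structured.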
static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { // adjust stats (after padding check and potentially recursive `mi_free` above) diff --git a/src/init.c b/src/init.c index d1670d02..01beb222 100644 --- a/src/init.c +++ b/src/init.c @@ -83,7 +83,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 } \ MI_STAT_COUNT_END_NULL() // -------------------------------------------------------- diff --git a/src/options.c b/src/options.c index 759d096d..1b326cc3 100644 --- a/src/options.c +++ b/src/options.c @@ -143,7 +143,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. #if defined(MI_VISIT_ABANDONED) diff --git a/src/page.c b/src/page.c index 4d26dbad..9ea7a979 100644 --- a/src/page.c +++ b/src/page.c @@ -811,7 +811,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p _mi_page_free(page_candidate, pq); page_candidate = page; } - else if (page->used >= page_candidate->used) { + else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate diff --git a/src/stats.c b/src/stats.c index 53b18da0..2a793b59 100644 --- a/src/stats.c +++ b/src/stats.c @@ -331,6 +331,10 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, ""); mi_stat_print_ex(&stats->pages, "pages", -1, out, arg, ""); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); + mi_stat_counter_print(&stats->pages_reclaim_on_alloc, "-reclaima", out, arg); + mi_stat_counter_print(&stats->pages_reclaim_on_free, "-reclaimf", out, arg); + mi_stat_counter_print(&stats->pages_reabandon_full, "-reabandon", out, arg); + mi_stat_counter_print(&stats->pages_unabandon_busy_wait, "-waits", out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); diff --git a/test/test-stress.c b/test/test-stress.c index ffeb5dea..4c2719aa 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -43,7 +43,13 @@ static int ITER = 10; #elif 0 static int THREADS = 4; static int SCALE = 100; +static int ITER = 10; +#define ALLOW_LARGE false +#elif 1 +static int THREADS = 32; +static int SCALE = 50; static int ITER = 50; +#define ALLOW_LARGE false #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 50; // scaling factor @@ -54,7 +60,12 @@ static int 
ITER = 50; // N full iterations destructing and re-creating a #define STRESS // undefine for leak test -static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) +#ifndef ALLOW_LARGE +#define ALLOW_LARGE true +#endif + +static bool allow_large_objects = ALLOW_LARGE; // allow very large objects? (set to `true` if SCALE>100) + static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? static bool main_participates = false; // main thread participates as a worker too @@ -332,6 +343,8 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,true,false); #endif // mi_stats_print(NULL); +#else + mi_stats_print(NULL); // so we see rss/commit/elapsed #endif //bench_end_program(); return 0; From 3fc2c8e279bc7d0ba18378ec1f525adff8672a87 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 11:06:07 -0800 Subject: [PATCH 026/264] fix assertions --- include/mimalloc/internal.h | 3 +-- src/bitmap.c | 4 +--- src/free.c | 2 +- src/init.c | 3 ++- src/page.c | 2 +- test/test-stress.c | 4 ++-- 6 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 56172bcd..9fa27f31 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -676,8 +676,7 @@ static inline bool mi_page_try_claim_ownership(mi_page_t* page) { static inline void _mi_page_unown(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(mi_page_is_abandoned(page)); - mi_assert_internal(mi_page_thread_id(page)==0); + mi_assert_internal(mi_page_is_abandoned(page)); mi_thread_free_t tf_new; mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); do { diff --git a/src/bitmap.c b/src/bitmap.c index df25e028..4eadce80 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1120,9 +1120,7 @@ static inline bool mi_bfield_atomic_clear_while_not_busy(_Atomic(mi_bfield_t)*b, do { if mi_unlikely((old&mask)==mask_busy) { old = mi_atomic_load_acquire(b); - if ((old&mask)==mask_busy) { - _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); - } + if ((old&mask)==mask_busy) { _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); } while ((old&mask)==mask_busy) { // busy wait mi_atomic_yield(); old = mi_atomic_load_acquire(b); diff --git a/src/free.c b/src/free.c index 6e8514c6..70ef5d8a 100644 --- a/src/free.c +++ b/src/free.c @@ -155,7 +155,7 @@ void mi_free(void* p) mi_attr_noexcept static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(mi_page_thread_id(page)==0); + mi_assert_internal(mi_page_is_abandoned(page)); #if 1 // we own the page now.. 
// safe to collect the thread atomic free list diff --git a/src/init.c b/src/init.c index 01beb222..99a5ea39 100644 --- a/src/init.c +++ b/src/init.c @@ -83,7 +83,8 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + { 0, 0 } \ MI_STAT_COUNT_END_NULL() // -------------------------------------------------------- diff --git a/src/page.c b/src/page.c index 9ea7a979..e5e3f972 100644 --- a/src/page.c +++ b/src/page.c @@ -811,7 +811,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p _mi_page_free(page_candidate, pq); page_candidate = page; } - else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { + else if (page->used >= page_candidate->used) { // && !mi_page_is_mostly_used(page)) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate diff --git a/test/test-stress.c b/test/test-stress.c index 4c2719aa..9e53e920 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -45,14 +45,14 @@ static int THREADS = 4; static int SCALE = 100; static int ITER = 10; #define ALLOW_LARGE false -#elif 1 +#elif 0 static int THREADS = 32; static int SCALE = 50; static int ITER = 50; #define ALLOW_LARGE false #else static int THREADS = 32; // more repeatable if THREADS <= #processors -static int SCALE = 50; // scaling factor +static int SCALE = 25; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads #endif From 8d9c725482537a811b4eb9c982bfbfdf7680cbc1 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 17:27:43 -0800 Subject: [PATCH 027/264] increase MAX_OBJ_SLICES to a full chunk (32MiB) --- include/mimalloc/internal.h | 15 +++ include/mimalloc/types.h | 3 +- src/arena.c | 65 ++++++++++--- src/bitmap.c | 185 +++++++++++++++++++++++++++--------- src/bitmap.h | 47 ++++----- src/os.c | 15 --- src/page-map.c | 2 +- 7 files changed, 230 insertions(+), 102 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 9fa27f31..34dbab07 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -339,6 +339,21 @@ static inline uint8_t* _mi_align_up_ptr(void* p, size_t alignment) { } +static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { + mi_assert_internal(alignment != 0); + uintptr_t mask = alignment - 1; + if ((alignment & mask) == 0) { // power of two? + return (sz & ~mask); + } + else { + return ((sz / alignment) * alignment); + } +} + +static inline void* mi_align_down_ptr(void* p, size_t alignment) { + return (void*)_mi_align_down((uintptr_t)p, alignment); +} + // Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { mi_assert_internal(divider != 0); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 4430cd6c..3d83e27a 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -128,8 +128,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) #define MI_ARENA_MIN_OBJ_SLICES (1) -#define MI_ARENA_MAX_OBJ_SLICES (MI_SIZE_BITS) // for now, cannot cross bit field boundaries.. todo: make it at least MI_BITMAP_CHUNK_BITS ? 
(16 MiB) -// #define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries +#define MI_ARENA_MAX_OBJ_SLICES (MI_BITMAP_CHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE) #define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE) diff --git a/src/arena.c b/src/arena.c index a2343674..1b891377 100644 --- a/src/arena.c +++ b/src/arena.c @@ -193,30 +193,55 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( void* p = mi_arena_slice_start(arena, slice_index); *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; - + // set the dirty bits if (arena->memid.initially_zero) { + // size_t dirty_count = 0; memid->initially_zero = mi_bitmap_setN(&arena->slices_dirty, slice_index, slice_count, NULL); + //if (dirty_count>0) { + // if (memid->initially_zero) { + // _mi_error_message(EFAULT, "ouch1\n"); + // } + // // memid->initially_zero = false; + //} + //else { + // if (!memid->initially_zero) { + // _mi_error_message(EFAULT, "ouch2\n"); + // } + // // memid->initially_zero = true; + //} } // set commit state if (commit) { - // commit requested, but the range may not be committed as a whole: ensure it is committed now memid->initially_committed = true; - size_t already_committed_count = 0; - mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &already_committed_count); - if (already_committed_count < slice_count) { - // recommit the full range + // commit requested, but the range may not be committed as a whole: ensure it is committed now + if (!mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)) { + // not fully committed: commit the full range and set the commit bits + // (this may race and we may double-commit which is fine) bool commit_zero = false; - mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { memid->initially_committed = false; } else { if (commit_zero) { memid->initially_zero = true; } + #if MI_DEBUG > 1 + if (memid->initially_zero) { + if (!mi_mem_is_zero(p, mi_size_of_slices(slice_count))) { + _mi_error_message(EFAULT, "arena allocation was not zero-initialized!\n"); + memid->initially_zero = false; + } + } + #endif + size_t already_committed_count = 0; + mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &already_committed_count); + if (already_committed_count < slice_count) { + // todo: also decrease total + mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + } } - } + } } else { // no need to commit, but check if already fully committed @@ -523,7 +548,18 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // claimed free slices: initialize the page partly - if (!memid.initially_zero) { _mi_memzero_aligned(page, sizeof(*page)); } + if (!memid.initially_zero) { + _mi_memzero_aligned(page, sizeof(*page)); + } + #if MI_DEBUG > 1 + else { + if (!mi_mem_is_zero(page, mi_size_of_slices(slice_count))) { + _mi_error_message(EFAULT, "page memory was not zero initialized!\n"); + memid.initially_zero = false; + _mi_memzero_aligned(page, sizeof(*page)); + } + } + #endif mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), 
MI_PAGE_MIN_BLOCK_ALIGN)); const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); @@ -668,7 +704,7 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); - // mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); @@ -851,6 +887,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi mi_assert_internal(all_committed); } else { + /* if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) mi_bitmap_clearN(&arena->slices_committed, slice_index, slice_count); @@ -864,6 +901,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // that contains already decommitted parts. Since purge consistently uses reset or decommit that // works (as we should never reset decommitted parts). } + */ // (delay) purge the entire range mi_arena_schedule_purge(arena, slice_index, slice_count, stats); } @@ -1014,7 +1052,12 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int else { mi_bitmap_setN(&arena->slices_committed, 0, info_slices, NULL); } - mi_bitmap_setN(&arena->slices_dirty, 0, info_slices, NULL); + if (!memid.initially_zero) { + mi_bitmap_unsafe_setN(&arena->slices_dirty, 0, arena->slice_count); + } + else { + mi_bitmap_setN(&arena->slices_dirty, 0, info_slices, NULL); + } return mi_arena_add(arena, arena_id, &_mi_stats_main); } diff --git a/src/bitmap.c b/src/bitmap.c index 4eadce80..a6c9e879 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -42,6 +42,25 @@ static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { return mi_rotr(x,r); } +static inline mi_bfield_t mi_bfield_zero(void) { + return 0; +} + +static inline mi_bfield_t mi_bfield_one(void) { + return 1; +} + +static inline mi_bfield_t mi_bfield_all_set(void) { + return ~((mi_bfield_t)0); +} + +static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { + mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS); + const mi_bfield_t mask0 = (bit_count < MI_BFIELD_BITS ? (mi_bfield_one() << bit_count)-1 : mi_bfield_all_set()); + return (mask0 << shiftl); +} + + // Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). // return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -52,7 +71,7 @@ static inline bool mi_bfield_find_least_to_xset(mi_bit_t set, mi_bfield_t x, siz // Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = ((mi_bfield_t)1)< n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask = (m == MI_BFIELD_BITS ? 
~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset ); + const bool transition = mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset); + if (already_xset > 0 && transition) { + _mi_error_message(EFAULT, "ouch\n"); + } + all_transition = all_transition && transition; all_already_xset += already_xset; // next field field++; @@ -335,7 +372,6 @@ static inline bool mi_bitmap_chunk_is_clear2(mi_bitmap_chunk_t* chunk, size_t ci static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); - bool all_xset = true; size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; while (n > 0) { @@ -343,14 +379,16 @@ static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, siz if (m > n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask); + const size_t mask = mi_bfield_mask(m, idx); + if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mask)) { + return false; + } // next field field++; idx = 0; n -= m; } - return all_xset; + return true; } @@ -389,14 +427,14 @@ static inline bool mi_bitmap_chunk_try_clear8(mi_bitmap_chunk_t* chunk, size_t b // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; size_t start_idx = cidx % MI_BFIELD_BITS; size_t start_field = cidx / MI_BFIELD_BITS; size_t end_field = MI_BITMAP_CHUNK_FIELDS; - size_t mask_mid = 0; - size_t mask_end = 0; + mi_bfield_t mask_mid = 0; + mi_bfield_t mask_end = 0; // first field size_t field = start_field; @@ -404,7 +442,7 @@ static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, si if (m > n) { m = n; } mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask_start)) return false; // done? @@ -417,7 +455,7 @@ static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, si while (n >= MI_BFIELD_BITS) { field++; mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - mask_mid = ~MI_ZU(0); + mask_mid = mi_bfield_all_set(); if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore; n -= MI_BFIELD_BITS; } @@ -428,7 +466,7 @@ static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, si field++; mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); end_field = field; - mask_end = (MI_ZU(1)<bfields[field], mask_end)) goto restore; } @@ -602,14 +640,12 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, } -// find a sequence of `n` bits in a chunk with all `n` (`< MI_BFIELD_BITS`!) bits set, -// and try unset it atomically +// find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, +// and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. -// todo: try avx2 and neon version -// todo: allow spanning across bfield boundaries? 
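The new `mi_bfield_mask` helper above replaces the repeated `(m == MI_BFIELD_BITS ? ~MI_ZU(0) : ...)` ternaries, and the guard it keeps is not just cosmetic: shifting a value by its full bit width is undefined behavior in C. A small self-contained equivalent for 64-bit fields (the name `make_mask` is mine, not from the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Build a mask of `count` one-bits starting at bit `shift` (count + shift <= 64).
// Guarding `count < 64` matters: `(uint64_t)1 << 64` is undefined behavior in C.
static inline uint64_t make_mask(size_t count, size_t shift) {
  assert(count + shift <= 64);
  const uint64_t ones = (count < 64 ? (((uint64_t)1 << count) - 1) : ~(uint64_t)0);
  return (ones << shift);
}

int main(void) {
  printf("%016llx\n", (unsigned long long)make_mask(4, 8));   // 0000000000000f00
  printf("%016llx\n", (unsigned long long)make_mask(64, 0));  // ffffffffffffffff
  return 0;
}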
-static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { - if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger? - const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1); +static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BFIELD_BITS) return false; + const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { mi_bfield_t b = chunk->bfields[i]; size_t bshift = 0; @@ -636,8 +672,48 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, // advance const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) mi_assert_internal(ones>0); - bshift += ones; b >>= ones; + bshift += ones; + } + } + } + return false; +} + +// find a sequence of `n` bits in a chunk with `n < MI_BITMAP_CHUNK_BITS` with all bits set, +// and try to clear them atomically. +// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. +static bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); + + // we align an a field, and require `field_count` fields to be all clear. + // n >= MI_BFIELD_BITS; find a first field that is 0 + const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields + for (size_t i = 0; i <= MI_BITMAP_CHUNK_FIELDS - field_count; i++) + { + // first pre-scan for a range of fields that are all set + bool allset = true; + size_t j = 0; + do { + mi_assert_internal(i + j < MI_BITMAP_CHUNK_FIELDS); + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); + if (~b != 0) { + allset = false; + i += j; // no need to look again at the previous fields + break; + } + } while (++j < field_count); + + // if all set, we can try to atomically clear them + if (allset) { + const size_t cidx = i*MI_BFIELD_BITS; + if (mi_bitmap_chunk_try_clearN(chunk, cidx, n)) { + // we cleared all atomically + *pidx = cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx + n <= MI_BITMAP_CHUNK_BITS); + return true; } } } @@ -796,7 +872,7 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving the bitmask as is. -bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < MI_BITMAP_MAX_BITS); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; @@ -816,12 +892,9 @@ bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { } } - - - // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. 
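The `mi_bitmap_chunk_find_and_try_clearNX` routine above walks a bit field by testing an n-bit mask and, when the mask does not fit, skipping the whole run it just examined via a count-trailing-zeros on the complement. The following stand-alone sketch shows that scanning idea for a single 64-bit word; it is an independent, sequential illustration (no atomics), `find_run_of_ones` is not a function from the patch, and it assumes GCC/Clang's `__builtin_ctzll`.

#include <stdbool.h>
#include <stdint.h>

// Find the lowest run of `n` consecutive 1-bits in `x` (1 <= n <= 64).
// On success, store the start index in *idx and return true.
static bool find_run_of_ones(uint64_t x, unsigned n, unsigned* idx) {
  if (n == 0 || n > 64) return false;
  unsigned shift = 0;
  while (x != 0) {
    const unsigned zeros = (unsigned)__builtin_ctzll(x);   // skip the 0s below the next run of 1s
    x >>= zeros;
    shift += zeros;
    // length of the run of 1s now at the bottom (ctz of the complement)
    const unsigned ones = (x == ~(uint64_t)0 ? 64u : (unsigned)__builtin_ctzll(~x));
    if (ones >= n) { *idx = shift; return true; }
    x >>= ones;                                            // run too short: skip it entirely
    shift += ones;
  }
  return false;
}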
-bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < MI_BITMAP_MAX_BITS); mi_assert_internal(idx%8 == 0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; @@ -846,13 +919,12 @@ bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +static bool mi_bitmap_try_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); } - if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); } - + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + if (n==0 || idx + n > MI_BITMAP_MAX_BITS) return false; + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) @@ -875,13 +947,21 @@ bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } } +bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); + if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); + if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); + return mi_bitmap_try_xsetN_(set, bitmap, idx, n); +} + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { +static bool mi_bitmap_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + //TODO: specialize? //if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } //if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } @@ -899,14 +979,26 @@ bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, si } else { const size_t epoch = mi_bitmap_epoch(bitmap); - bool cleared = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); - if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + size_t already_clear = 0; + const bool allset = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear); + if (already_xset != NULL) { *already_xset = already_clear; } + if (already_clear < n && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } - return cleared; + return allset; } } +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { + mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); + //TODO: specialize? 
+ //if (n==1) return mi_bitmap_xset(set, bitmap, idx); + //if (n==8) return mi_bitmap_xset8(set, bitmap, idx); + return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); +} + // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { @@ -949,7 +1041,7 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +mi_decl_nodiscard static bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { mi_bitmap_forall_set_chunks(bitmap, tseq, epoch, chunk_idx) { size_t cidx; @@ -973,7 +1065,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { +mi_decl_nodiscard static bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { mi_bitmap_forall_set_chunks(bitmap,tseq, epoch, chunk_idx) { size_t cidx; @@ -997,10 +1089,9 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { - // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger - // TODO: allow spanning across chunk boundaries - if (n == 0 || n > MI_BFIELD_BITS) return false; +mi_decl_nodiscard static bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { + // TODO: allow spanning across chunk boundaries? + if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; mi_bitmap_forall_set_chunks(bitmap,tseq,epoch,chunk_idx) { size_t cidx; @@ -1021,6 +1112,12 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t return false; } +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { + if (n == 1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); + if (n == 8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); + return mi_bitmap_try_find_and_clearN_(bitmap, n, tseq, pidx); +} + /* -------------------------------------------------------------------------------- pairmap epochset diff --git a/src/bitmap.h b/src/bitmap.h index 8c961fe1..948bd1e3 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -90,28 +90,28 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) // and false otherwise leaving the bitmask as is. 
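Several of the routines above consult a per-chunk summary (`mi_bitmap_anyset_try_clear`, `mi_bitmap_forall_set_chunks`) so that a search can skip chunks with no set bits at all, clearing the summary bit once a chunk becomes empty. The sketch below shows just that two-level structure in a sequential setting; the epoch handling and atomics of the real code are deliberately left out, and all names (`two_level_bitmap_t`, `bitmap_find_and_clear`, ...) are illustrative only.

#include <stdbool.h>
#include <stdint.h>

#define CHUNKS 64   // 64 chunks of 64 bits = 4096 bits total

typedef struct two_level_bitmap_s {
  uint64_t any_set;            // bit i is set iff chunks[i] != 0
  uint64_t chunks[CHUNKS];
} two_level_bitmap_t;

static void bitmap_set(two_level_bitmap_t* bm, unsigned idx) {
  const unsigned ci = idx / 64, bi = idx % 64;
  bm->chunks[ci] |= ((uint64_t)1 << bi);
  bm->any_set    |= ((uint64_t)1 << ci);
}

static void bitmap_clear(two_level_bitmap_t* bm, unsigned idx) {
  const unsigned ci = idx / 64, bi = idx % 64;
  bm->chunks[ci] &= ~((uint64_t)1 << bi);
  if (bm->chunks[ci] == 0) { bm->any_set &= ~((uint64_t)1 << ci); }  // chunk empty: clear summary
}

// Find and clear any set bit, skipping empty chunks via the summary word.
static bool bitmap_find_and_clear(two_level_bitmap_t* bm, unsigned* idx) {
  uint64_t any = bm->any_set;
  while (any != 0) {
    const unsigned ci = (unsigned)__builtin_ctzll(any);
    if (bm->chunks[ci] != 0) {
      const unsigned bi = (unsigned)__builtin_ctzll(bm->chunks[ci]);
      *idx = ci*64 + bi;
      bitmap_clear(bm, *idx);
      return true;
    }
    any &= ~((uint64_t)1 << ci);  // defensive skip (cannot happen single-threaded; mirrors the concurrent design)
  }
  return false;
}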
-mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); - -static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); -} - -static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); -} +//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +// +//static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { +// return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); +//} +// +//static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { +// return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); +//} // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. -mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); - -static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); -} - -static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); -} +//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +// +//static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { +// return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); +//} +// +//static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { +// return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); +//} // Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. @@ -126,17 +126,6 @@ static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); } - -// Find a set bit in a bitmap and atomically unset it. Returns true on success, -// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. -// The low `MI_BFIELD_BITS` of start are used to set the start point of the search -// (to reduce thread contention). -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); - -// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ); - // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); diff --git a/src/os.c b/src/os.c index bac59437..c7f464c0 100644 --- a/src/os.c +++ b/src/os.c @@ -92,21 +92,6 @@ void _mi_os_init(void) { bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); -static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { - mi_assert_internal(alignment != 0); - uintptr_t mask = alignment - 1; - if ((alignment & mask) == 0) { // power of two? 
- return (sz & ~mask); - } - else { - return ((sz / alignment) * alignment); - } -} - -static void* mi_align_down_ptr(void* p, size_t alignment) { - return (void*)_mi_align_down((uintptr_t)p, alignment); -} - void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { MI_UNUSED(try_alignment); MI_UNUSED(size); return NULL; diff --git a/src/page-map.c b/src/page-map.c index 15578301..0e99a890 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -24,7 +24,7 @@ static bool mi_page_map_init(void) { mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); - mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems + mi_page_map_all_committed = false; // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); From e5fdd6e110471b6665ee388366c7aa493c2a7557 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 22:43:14 -0800 Subject: [PATCH 028/264] wip: initial large bitmaps --- src/arena.c | 162 ++++++----- src/bitmap.c | 666 +++++++++++++++++++++++---------------------- src/bitmap.h | 108 +++++--- src/page-map.c | 3 +- test/test-stress.c | 2 +- 5 files changed, 501 insertions(+), 440 deletions(-) diff --git a/src/arena.c b/src/arena.c index 1b891377..f8b6fca1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -37,18 +37,20 @@ typedef struct mi_arena_s { mi_arena_id_t id; // arena id; 0 for non-specific size_t slice_count; // size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) + size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. - mi_bitmap_t slices_free; // is the slice free? - mi_bitmap_t slices_committed; // is the slice committed? (i.e. accessible) - mi_bitmap_t slices_purge; // can the slice be purged? (slice in purge => slice in free) - mi_bitmap_t slices_dirty; // is the slice potentially non-zero? + mi_bitmap_t* slices_free; // is the slice free? + mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) + mi_bitmap_t* slices_purge; // can the slice be purged? (slice in purge => slice in free) + mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? 
mi_pairmap_t pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages + // followed by the bitmaps (whose size depends on the arena size) } mi_arena_t; #define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) @@ -58,6 +60,7 @@ static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 + /* ----------------------------------------------------------- Arena id's id = arena_index + 1 @@ -103,6 +106,11 @@ mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { return mi_arena_from_index(mi_arena_id_index(id)); } +static size_t mi_arena_info_slices(mi_arena_t* arena) { + return arena->info_slices; +} + + /* ----------------------------------------------------------- Util @@ -114,14 +122,6 @@ static size_t mi_arena_size(mi_arena_t* arena) { return mi_size_of_slices(arena->slice_count); } -static size_t mi_arena_info_slices(void) { - const size_t os_page_size = _mi_os_page_size(); - const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page - const size_t info_slices = mi_slice_count_of_size(info_size); - return info_slices; -} - - // Start of the arena memory area static uint8_t* mi_arena_start(mi_arena_t* arena) { return ((uint8_t*)arena); @@ -187,7 +187,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) { size_t slice_index; - if (!mi_bitmap_try_find_and_clearN(&arena->slices_free, slice_count, tseq, &slice_index)) return NULL; + if (!mi_bitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; // claimed it! 
void* p = mi_arena_slice_start(arena, slice_index); @@ -197,7 +197,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // set the dirty bits if (arena->memid.initially_zero) { // size_t dirty_count = 0; - memid->initially_zero = mi_bitmap_setN(&arena->slices_dirty, slice_index, slice_count, NULL); + memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL); //if (dirty_count>0) { // if (memid->initially_zero) { // _mi_error_message(EFAULT, "ouch1\n"); @@ -217,7 +217,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = true; // commit requested, but the range may not be committed as a whole: ensure it is committed now - if (!mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)) { + if (!mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)) { // not fully committed: commit the full range and set the commit bits // (this may race and we may double-commit which is fine) bool commit_zero = false; @@ -235,7 +235,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } #endif size_t already_committed_count = 0; - mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &already_committed_count); + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); if (already_committed_count < slice_count) { // todo: also decrease total mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); @@ -245,13 +245,13 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } else { // no need to commit, but check if already fully committed - memid->initially_committed = mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count); + memid->initially_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - if (commit) { mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); } - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); - // mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + if (commit) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); } + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + // mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); return p; } @@ -285,8 +285,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = mi_size_of_slices(mi_arena_info_slices() + 1); - const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_SLICE_SIZE; + const size_t min_reserve = 8; // hope that fits minimal bitmaps? 
+ const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; } @@ -494,10 +494,10 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); @@ -670,9 +670,9 @@ void _mi_arena_page_free(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_pairmap_is_clear(&arena->pages_abandoned[bin], slice_index)); } #endif @@ -701,10 +701,10 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(!mi_page_is_singleton(page)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); @@ -757,9 +757,9 @@ void _mi_arena_page_unabandon(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); + 
mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_pairmap_clear_while_not_busy(&arena->pages_abandoned[bin], slice_index); @@ -876,8 +876,8 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi return; } mi_assert_internal(slice_index < arena->slice_count); - mi_assert_internal(slice_index >= mi_arena_info_slices()); - if (slice_index < mi_arena_info_slices() || slice_index > arena->slice_count) { + mi_assert_internal(slice_index >= mi_arena_info_slices(arena)); + if (slice_index < mi_arena_info_slices(arena) || slice_index > arena->slice_count) { _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } @@ -907,7 +907,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } // and make it available to others again - bool all_inuse = mi_bitmap_setN(&arena->slices_free, slice_index, slice_count, NULL); + bool all_inuse = mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; @@ -989,6 +989,29 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* return true; } +static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_base) { + if (slice_count == 0) slice_count = MI_BITMAP_CHUNK_BITS; + mi_assert_internal((slice_count % MI_BITMAP_CHUNK_BITS) == 0); + const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BITMAP_CHUNK_SIZE); + const size_t bitmaps_size = 4 * mi_bitmap_size(slice_count,NULL); + const size_t pairmaps_size = MI_BIN_COUNT * 2 * mi_bitmap_size(slice_count,NULL); + const size_t size = base_size + bitmaps_size + pairmaps_size; + + const size_t os_page_size = _mi_os_page_size(); + const size_t info_size = _mi_align_up(size, os_page_size) + os_page_size; // + guard page + const size_t info_slices = mi_slice_count_of_size(info_size); + + if (bitmap_base != NULL) *bitmap_base = base_size; + return info_slices; +} + +static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { + mi_bitmap_t* bitmap = (mi_bitmap_t*)(*base); + *base = (*base) + mi_bitmap_init(bitmap, slice_count, true /* already zero */); + return bitmap; +} + + static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); @@ -1003,23 +1026,25 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } - const size_t info_slices = mi_arena_info_slices(); - const size_t bcount = size / MI_ARENA_SLICE_SIZE; // divide down - if (bcount < info_slices+1) { + const size_t slice_count = _mi_align_down(size / MI_ARENA_SLICE_SIZE, MI_BITMAP_CHUNK_BITS); + if (slice_count > MI_BITMAP_MAX_BIT_COUNT) { // 16 GiB for now + // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) + _mi_warning_message("cannot use OS memory since it is too large (size 
%zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB); + return false; + } + size_t bitmap_base; + const size_t info_slices = mi_arena_info_slices_needed(slice_count, &bitmap_base); + if (slice_count < info_slices+1) { _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_slices(info_slices+1)/MI_KiB); return false; } - if (bcount > MI_BITMAP_MAX_BITS) { - // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) - _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BITS)/MI_MiB); - return false; - } + mi_arena_t* arena = (mi_arena_t*)start; // commit & zero if needed bool is_zero = memid.initially_zero; if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_slices(info_slices), &is_zero, &_mi_stats_main); + _mi_os_commit(arena, mi_size_of_slices(info_slices), NULL, &_mi_stats_main); } if (!is_zero) { _mi_memzero(arena, mi_size_of_slices(info_slices)); @@ -1029,34 +1054,37 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->id = _mi_arena_id_none(); arena->memid = memid; arena->exclusive = exclusive; - arena->slice_count = bcount; + arena->slice_count = slice_count; + arena->info_slices = info_slices; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps - mi_bitmap_init(&arena->slices_free,true); - mi_bitmap_init(&arena->slices_committed,true); - mi_bitmap_init(&arena->slices_dirty,true); - mi_bitmap_init(&arena->slices_purge,true); + uint8_t* base = mi_arena_start(arena) + bitmap_base; + arena->slices_free = mi_arena_bitmap_init(slice_count,&base); + arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); + arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); + arena->slices_purge = mi_arena_bitmap_init(slice_count,&base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { - mi_pairmap_init(&arena->pages_abandoned[i],true); + mi_pairmap_init(&arena->pages_abandoned[i], mi_arena_bitmap_init(slice_count, &base), mi_arena_bitmap_init(slice_count, &base)); } + mi_assert_internal(mi_size_of_slices(info_slices) >= (size_t)(base - mi_arena_start(arena))); // reserve our meta info (and reserve slices outside the memory area) - mi_bitmap_unsafe_setN(&arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); + mi_bitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { - mi_bitmap_unsafe_setN(&arena->slices_committed, 0, arena->slice_count); + mi_bitmap_unsafe_setN(arena->slices_committed, 0, arena->slice_count); } else { - mi_bitmap_setN(&arena->slices_committed, 0, info_slices, NULL); + mi_bitmap_setN(arena->slices_committed, 0, info_slices, NULL); } if (!memid.initially_zero) { - mi_bitmap_unsafe_setN(&arena->slices_dirty, 0, arena->slice_count); + mi_bitmap_unsafe_setN(arena->slices_dirty, 0, arena->slice_count); } else { - mi_bitmap_setN(&arena->slices_dirty, 0, info_slices, NULL); + mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } return mi_arena_add(arena, arena_id, &_mi_stats_main); @@ -1117,7 +1145,7 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ 
_mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; - for (int i = 0; i < MI_BITMAP_CHUNK_COUNT && bit_count < slice_count; i++) { + for (int i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { char buf[MI_BITMAP_CHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { @@ -1161,12 +1189,12 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) slice_total += arena->slice_count; _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, &arena->slices_free, true); + free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, arena->slices_free, true); } - mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, &arena->slices_committed, false); + mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, arena->slices_committed, false); // todo: abandoned slices if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, &arena->slices_purge, false); + purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, arena->slices_purge, false); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); @@ -1262,7 +1290,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, const size_t size = mi_size_of_slices(slices); void* const p = mi_arena_slice_start(arena, slice_index); bool needs_recommit; - if (mi_bitmap_is_setN(&arena->slices_committed, slice_index, slices)) { + if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slices)) { // all slices are committed, we can purge freely needs_recommit = _mi_os_purge(p, size, stats); } @@ -1277,11 +1305,11 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, } // clear the purged slices - mi_bitmap_clearN(&arena->slices_purge, slices, slice_index); + mi_bitmap_clearN(arena->slices_purge, slices, slice_index); // update committed bitmap if (needs_recommit) { - mi_bitmap_clearN(&arena->slices_committed, slices, slice_index); + mi_bitmap_clearN(arena->slices_committed, slices, slice_index); } } diff --git a/src/bitmap.c b/src/bitmap.c index a6c9e879..4156cfd1 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -64,7 +64,7 @@ static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { // Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). // return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -static inline bool mi_bfield_find_least_to_xset(mi_bit_t set, mi_bfield_t x, size_t* idx) { +static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) { return mi_bfield_find_least_bit((set ? ~x : x), idx); } @@ -85,7 +85,7 @@ static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx) { } // Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). 
-static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { +static inline bool mi_bfield_atomic_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { if (set) { return mi_bfield_atomic_set(b, idx); } @@ -115,7 +115,7 @@ static inline bool mi_bfield_atomic_clear2(_Atomic(mi_bfield_t)*b, size_t idx, b } // Set/clear a pair of bits atomically, and return true of the mask bits transitioned from all 0's to 1's (or all 1's to 0's) -static inline bool mi_bfield_atomic_xset2(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx, bool* already_xset) { +static inline bool mi_bfield_atomic_xset2(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx, bool* already_xset) { if (set) { return mi_bfield_atomic_set2(b, idx, already_xset); } @@ -143,7 +143,7 @@ static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield } // Set/clear a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's (or all 1's to 0's) -static inline bool mi_bfield_atomic_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_xset) { +static inline bool mi_bfield_atomic_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_xset) { mi_assert_internal(mask != 0); if (set) { return mi_bfield_atomic_set_mask(b, mask, already_xset); @@ -169,7 +169,7 @@ static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx } // Tries to set/clear a bit atomically, and returns true if the bit atomically transitioned from 0 to 1 (or 1 to 0) -static inline bool mi_bfield_atomic_try_xset( mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { +static inline bool mi_bfield_atomic_try_xset( mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); // for a single bit, we can always just set/clear and test afterwards if it was actually us that changed it first return mi_bfield_atomic_xset(set, b, idx); @@ -201,7 +201,7 @@ static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bf // Tries to (un)set a mask atomically, and returns true if the mask bits atomically transitioned from 0 to mask (or mask to 0) // and false otherwise (leaving the bit field as is). -static inline bool mi_bfield_atomic_try_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask ) { +static inline bool mi_bfield_atomic_try_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask ) { mi_assert_internal(mask != 0); if (set) { return mi_bfield_atomic_try_set_mask(b, mask); @@ -228,7 +228,7 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t by // Tries to set/clear a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise (leaving the bit field as is). -static inline bool mi_bfield_atomic_try_xset8(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t byte_idx) { +static inline bool mi_bfield_atomic_try_xset8(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t byte_idx) { mi_assert_internal(byte_idx < MI_BFIELD_SIZE); const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); return mi_bfield_atomic_try_xset_mask(set, b, mask); @@ -264,7 +264,7 @@ static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfi // Check if all bits corresponding to a mask are set/cleared. 
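The renamed mask helpers above follow a common pattern that a small standalone sketch makes explicit. This is not the patch's code: the word type, the memory orders, and the "count of bits already in the target state" out-parameter semantics are assumptions inferred from how the callers use these helpers.

#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>
#include <stddef.h>

typedef uint64_t bfield_t;   // assumed: one machine word per bit field

// Returns true iff every bit in `mask` transitioned 0 -> 1; optionally reports how
// many masked bits were already set before the operation.
static bool bfield_atomic_set_mask(_Atomic(bfield_t)* b, bfield_t mask, size_t* already_set) {
  bfield_t old = atomic_fetch_or_explicit(b, mask, memory_order_acq_rel);
  if (already_set != NULL) { *already_set = (size_t)__builtin_popcountll(old & mask); }  // GCC/Clang builtin
  return ((old & mask) == 0);
}

// Symmetric clear: true iff every bit in `mask` transitioned 1 -> 0.
static bool bfield_atomic_clear_mask(_Atomic(bfield_t)* b, bfield_t mask, size_t* already_clear) {
  bfield_t old = atomic_fetch_and_explicit(b, ~mask, memory_order_acq_rel);
  if (already_clear != NULL) { *already_clear = (size_t)__builtin_popcountll(~old & mask); }
  return ((old & mask) == mask);
}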
-static inline bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask) { mi_assert_internal(mask != 0); if (set) { return mi_bfield_atomic_is_set_mask(b, mask); @@ -276,7 +276,7 @@ static inline bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield // Check if a bit is set/clear -// static inline bool mi_bfield_atomic_is_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { +// static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { // mi_assert_internal(idx < MI_BFIELD_BITS); // const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx, all_already_xset); } @@ -309,7 +309,7 @@ static inline bool mi_bitmap_chunk_clear2(mi_bitmap_chunk_t* chunk, size_t cidx, // Set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). -static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* pall_already_xset) { +static bool mi_bitmap_chunk_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* pall_already_xset) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; @@ -349,7 +349,7 @@ static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, // check if a pair of bits is set/clear -static inline bool mi_bitmap_chunk_is_xset2(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_is_xset2(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; @@ -369,7 +369,7 @@ static inline bool mi_bitmap_chunk_is_clear2(mi_bitmap_chunk_t* chunk, size_t ci // Check if a sequence of `n` bits within a chunk are all set/cleared. -static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { +static bool mi_bitmap_chunk_is_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); size_t idx = cidx % MI_BFIELD_BITS; @@ -393,7 +393,7 @@ static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, siz -static inline bool mi_bitmap_chunk_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_try_xset(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; @@ -408,7 +408,7 @@ static inline bool mi_bitmap_chunk_try_clear(mi_bitmap_chunk_t* chunk, size_t ci return mi_bitmap_chunk_try_xset(MI_BIT_CLEAR, chunk, cidx); } -static inline bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx) { +static inline bool mi_bitmap_chunk_try_xset8(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx) { mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); const size_t i = byte_idx / MI_BFIELD_SIZE; const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; @@ -426,7 +426,7 @@ static inline bool mi_bitmap_chunk_try_clear8(mi_bitmap_chunk_t* chunk, size_t b // Try to atomically set/clear a sequence of `n` bits within a chunk. 
// Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. -static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { +static bool mi_bitmap_chunk_try_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; @@ -442,7 +442,7 @@ static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, si if (m > n) { m = n; } mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); - const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); + const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start)) return false; // done? @@ -509,7 +509,7 @@ static inline bool mi_mm256_is_zero( __m256i vec) { // find least 0/1-bit in a chunk and try to set/clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. // todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { +static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -644,7 +644,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { - if (n == 0 || n > MI_BFIELD_BITS) return false; + if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { mi_bfield_t b = chunk->bfields[i]; @@ -683,14 +683,14 @@ static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_ // find a sequence of `n` bits in a chunk with `n < MI_BITMAP_CHUNK_BITS` with all bits set, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. -static bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { +static bool mi_bitmap_chunk_find_and_try_clearN_(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); + // if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); // we align an a field, and require `field_count` fields to be all clear. 
// n >= MI_BFIELD_BITS; find a first field that is 0 const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields - for (size_t i = 0; i <= MI_BITMAP_CHUNK_FIELDS - field_count; i++) + for (size_t i = 0; i <= MI_BITMAP_CHUNK_FIELDS - field_count; i++) { // first pre-scan for a range of fields that are all set bool allset = true; @@ -721,6 +721,14 @@ static bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t } +static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n==1) return mi_bitmap_chunk_find_and_try_clear(chunk, pidx); + if (n==8) return mi_bitmap_chunk_find_and_try_clear8(chunk, pidx); + if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); + return mi_bitmap_chunk_find_and_try_clearN_(chunk, n, pidx); +} + // are all bits in a bitmap chunk set? // static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { // #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) @@ -755,70 +763,76 @@ static inline bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { } /* -------------------------------------------------------------------------------- - epochset (for now for 32-bit sets only) + chunkmap (for now for 32-bit sets only) -------------------------------------------------------------------------------- */ -static void mi_epochset_split(mi_epochset_t es, uint32_t* bset, size_t* epoch) { - *bset = (uint32_t)es; - *epoch = (size_t)(es >> 32); +static void mi_chunkmap_split(mi_chunkmap_t es, mi_cmap_t* cmap, mi_epoch_t* epoch) { + *cmap = (mi_cmap_t)es; + *epoch = (mi_epoch_t)(es >> 32); } -static mi_epochset_t mi_epochset_join(uint32_t bset, size_t epoch) { - return ((uint64_t)epoch << 32) | bset; +static mi_chunkmap_t mi_chunkmap_join(mi_cmap_t cmap, mi_epoch_t epoch) { + return ((mi_chunkmap_t)epoch << MI_CHUNKMAP_BITS) | cmap; } // setting a bit increases the epoch -static void mi_epochset_set(_Atomic(mi_epochset_t)*es, size_t idx) { - mi_assert(idx < 32); - size_t epoch; - uint32_t bset; - mi_epochset_t es_new; - mi_epochset_t es_old = mi_atomic_load_relaxed(es); +static void mi_chunkmap_set(_Atomic(mi_chunkmap_t)* cm, size_t idx) { + mi_assert(idx < MI_CHUNKMAP_BITS); + mi_epoch_t epoch; + mi_cmap_t cmap; + mi_chunkmap_t cm_new; + mi_chunkmap_t cm_old = mi_atomic_load_relaxed(cm); do { - mi_epochset_split(es_old, &bset, &epoch); - es_new = mi_epochset_join(bset | (MI_ZU(1)<any_set, chunk_idx); +static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); + const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; + const size_t idx = chunk_idx % MI_CHUNKMAP_BITS; + mi_chunkmap_set(&bitmap->chunk_maps[cmidx], idx); } -static bool mi_bitmap_anyset_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx, size_t epoch) { - mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); - return mi_epochset_try_clear(&bitmap->any_set, chunk_idx, epoch); +static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx, mi_epoch_t epoch) { + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); + const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; + const size_t idx = chunk_idx % MI_CHUNKMAP_BITS; + return mi_chunkmap_try_clear(&bitmap->chunk_maps[cmidx], idx, epoch); } -static uint32_t mi_bitmap_anyset(mi_bitmap_t* bitmap, size_t* epoch) { - uint32_t bset; - 
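A minimal sketch of the chunkmap word introduced here, assuming (consistently with the typedefs added to bitmap.h later in this patch) that the low 32 bits hold the per-chunk "may contain set bits" map and the high 32 bits hold an epoch that every set advances. The retry policy of mi_chunkmap_try_clear is not visible in this hunk, so the single compare-and-swap below is only illustrative.

#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>

typedef uint64_t chunkmap_t;
typedef uint32_t cmap_t;
typedef uint32_t epoch_t;

static void cm_split(chunkmap_t cm, cmap_t* map, epoch_t* epoch) {
  *map   = (cmap_t)cm;
  *epoch = (epoch_t)(cm >> 32);
}
static chunkmap_t cm_join(cmap_t map, epoch_t epoch) {
  return ((chunkmap_t)epoch << 32) | map;
}

// Setting a bit advances the epoch so that a concurrent try-clear can detect it.
static void cm_set(_Atomic(chunkmap_t)* cm, unsigned idx) {
  chunkmap_t old = atomic_load_explicit(cm, memory_order_relaxed);
  chunkmap_t desired;
  cmap_t map; epoch_t epoch;
  do {
    cm_split(old, &map, &epoch);
    desired = cm_join(map | ((cmap_t)1 << idx), epoch + 1);
  } while (!atomic_compare_exchange_weak(cm, &old, desired));
}

// Clearing only succeeds if the epoch still equals the one observed by the caller.
static bool cm_try_clear(_Atomic(chunkmap_t)* cm, unsigned idx, epoch_t expected_epoch) {
  chunkmap_t old = atomic_load_explicit(cm, memory_order_relaxed);
  cmap_t map; epoch_t epoch;
  cm_split(old, &map, &epoch);
  if (epoch != expected_epoch) return false;
  return atomic_compare_exchange_strong(cm, &old, cm_join(map & ~((cmap_t)1 << idx), epoch));
}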
mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, epoch); - return bset; +static mi_cmap_t mi_bitmap_chunkmap(mi_bitmap_t* bitmap, size_t chunk_idx, mi_epoch_t* epoch) { + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); + const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; + mi_assert_internal(cmidx < bitmap->chunk_map_count); + mi_cmap_t cmap; + mi_chunkmap_split(mi_atomic_load_relaxed(&bitmap->chunk_maps[cmidx]), &cmap, epoch); + return cmap; } -static size_t mi_bitmap_epoch(mi_bitmap_t* bitmap) { - size_t epoch; - uint32_t bset; - mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, &epoch); +static mi_epoch_t mi_bitmap_chunkmap_epoch(mi_bitmap_t* bitmap, size_t chunk_idx) { + mi_epoch_t epoch; + mi_bitmap_chunkmap(bitmap, chunk_idx, &epoch); return epoch; } @@ -826,17 +840,38 @@ static size_t mi_bitmap_epoch(mi_bitmap_t* bitmap) { bitmap -------------------------------------------------------------------------------- */ +size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { + mi_assert_internal((bit_count % MI_BITMAP_CHUNK_BITS) == 0); + bit_count = _mi_align_up(bit_count, MI_BITMAP_CHUNK_BITS); + mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); + mi_assert_internal(bit_count > 0); + const size_t chunk_count = bit_count / MI_BITMAP_CHUNK_BITS; + mi_assert_internal(chunk_count >= 1); + const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BITMAP_CHUNK_SIZE); + mi_assert_internal( (size%MI_BITMAP_CHUNK_SIZE) == 0 ); + if (pchunk_count != NULL) { *pchunk_count = chunk_count; } + return size; +} + // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { +// returns the size of the bitmap +size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) { + size_t chunk_count; + const size_t size = mi_bitmap_size(bit_count, &chunk_count); if (!already_zero) { - _mi_memzero_aligned(bitmap, sizeof(*bitmap)); + _mi_memzero_aligned(bitmap, size); } + bitmap->chunk_map_count = _mi_divide_up(chunk_count, MI_CHUNKMAP_BITS); + mi_assert_internal(bitmap->chunk_map_count <= MI_BITMAP_MAX_CHUNKMAPS); + bitmap->chunk_count = chunk_count; + mi_assert_internal(bitmap->chunk_map_count <= MI_BITMAP_MAX_CHUNK_COUNT); + return size; } // Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
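To make the new variable-size layout concrete, here is a simplified stand-in: the 512-bit chunk, the 16 chunkmap words and the 64-byte alignment mirror the constants in bitmap.h below, while the flexible array member replaces the patch's fixed-size chunks declaration for illustration only.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <stdalign.h>

typedef struct { alignas(64) uint64_t bfields[8]; } chunk_t;   // 512-bit chunk, 64-byte aligned (assumed)
typedef struct {
  size_t   chunk_map_count;
  size_t   chunk_count;
  uint64_t chunk_maps[16];      // up to 16 chunkmap words, as in bitmap.h below
  chunk_t  chunks[];            // simplified: chunk_count chunks follow the header
} bitmap_t;

int main(void) {
  const size_t slice_count = 16384;                 // a 1 GiB arena at 64 KiB slices (assumed)
  const size_t chunk_count = slice_count / 512;     // 32 chunks
  const size_t size = offsetof(bitmap_t, chunks) + chunk_count * sizeof(chunk_t);
  printf("bitmap for %zu bits: %zu chunks, %zu bytes\n", slice_count, chunk_count, size);
  return 0;
}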
void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); // first chunk size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; @@ -844,17 +879,17 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { size_t m = MI_BITMAP_CHUNK_BITS - cidx; if (m > n) { m = n; } mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); - mi_bitmap_anyset_set(bitmap, chunk_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); + _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * MI_BITMAP_CHUNK_SIZE); const size_t end_chunk = chunk_idx + mid_chunks; while (chunk_idx < end_chunk) { - mi_bitmap_anyset_set(bitmap, chunk_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); chunk_idx++; } n -= (mid_chunks * MI_BITMAP_CHUNK_BITS); @@ -865,28 +900,29 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); - mi_bitmap_anyset_set(bitmap, chunk_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); } } // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving the bitmask as is. -static bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < MI_BITMAP_MAX_BITS); +static bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (set) { - // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_anyset_set(bitmap, chunk_idx); + // first set the chunkmap since it is a conservative approximation (increases epoch) + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // then actually try to set it atomically return mi_bitmap_chunk_try_set(&bitmap->chunks[chunk_idx], cidx); } else { - const size_t epoch = mi_bitmap_epoch(bitmap); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); bool cleared = mi_bitmap_chunk_try_clear(&bitmap->chunks[chunk_idx], cidx); - if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); } return cleared; } @@ -894,22 +930,24 @@ static bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. 
-static bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < MI_BITMAP_MAX_BITS); +static bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); mi_assert_internal(idx%8 == 0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (set) { // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_anyset_set(bitmap, chunk_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // then actually try to set it atomically return mi_bitmap_chunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); } else { - const size_t epoch = mi_bitmap_epoch(bitmap); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); bool cleared = mi_bitmap_chunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx); - if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); } return cleared; } @@ -919,71 +957,63 @@ static bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -static bool mi_bitmap_try_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +static bool mi_bitmap_try_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - if (n==0 || idx + n > MI_BITMAP_MAX_BITS) return false; + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); + if (n==0 || idx + n > mi_bitmap_max_bits(bitmap)) return false; const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < MI_BFIELD_BITS); + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia if (set) { - // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_anyset_set(bitmap, chunk_idx); + // first set the chunkmap since it is a conservative approximation (increases epoch) + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // then actually try to set it atomically return mi_bitmap_chunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); } else { - const size_t epoch = mi_bitmap_epoch(bitmap); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); bool cleared = mi_bitmap_chunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n); - if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + 
mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); } return cleared; } } -bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); + // todo: add 32/64 for large pages return mi_bitmap_try_xsetN_(set, bitmap, idx, n); } - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// Set/clear a sequence of 2 bits that were on an even `idx` in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -static bool mi_bitmap_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - - //TODO: specialize? - //if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } - //if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } - +static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal((idx%2)==0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < MI_BFIELD_BITS); - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - + mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (set) { - // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_anyset_set(bitmap, chunk_idx); + // first set the chunkmap since it is a conservative approximation (increases epoch) + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // then actually try to set it atomically - return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); + return mi_bitmap_chunk_set2(&bitmap->chunks[chunk_idx], cidx, NULL); } else { - const size_t epoch = mi_bitmap_epoch(bitmap); - size_t already_clear = 0; - const bool allset = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear); - if (already_xset != NULL) { *already_xset = already_clear; } - if (already_clear < n && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); + bool already_clear = false; + const bool allset = mi_bitmap_chunk_clear2(&bitmap->chunks[chunk_idx], cidx, &already_clear); + if (!already_clear && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); } return allset; } @@ -991,25 +1021,67 @@ static bool mi_bitmap_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
-bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { +static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + + if (set) { + // first set the chunkmap since it is a conservative approximation (increases epoch) + mi_bitmap_chunkmap_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); + } + else { + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); + size_t already_clear = 0; + const bool allset = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear); + if (already_xset != NULL) { *already_xset = already_clear; } + if (already_clear < n && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + return allset; + } +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); //TODO: specialize? //if (n==1) return mi_bitmap_xset(set, bitmap, idx); + //if (n==2) return mi_bitmap_xset(set, bitmap, idx); //if (n==8) return mi_bitmap_xset8(set, bitmap, idx); return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); } +// Is a sequence of 2 bits already all set/cleared? +static inline bool mi_bitmap_is_xset2(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx + 2 <= mi_bitmap_max_bits(bitmap)); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + return mi_bitmap_chunk_is_xset2(set, &bitmap->chunks[chunk_idx], cidx); +} + + // Is a sequence of n bits already all set/cleared? 
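The same ordering discipline recurs in every set/clear path in this file. The sketch below is written as if it lived inside this bitmap.c, since it reuses the patch's static helpers, and spells out the invariant: on set, the chunkmap bit is raised before the chunk bits so the chunkmap stays a conservative over-approximation; on clear, the chunkmap bit is only dropped when the epoch is unchanged and the whole chunk is observed clear.

static bool bitmap_set_n_sketch(mi_bitmap_t* bitmap, size_t idx, size_t n) {
  const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
  const size_t cidx      = idx % MI_BITMAP_CHUNK_BITS;
  mi_bitmap_chunkmap_set(bitmap, chunk_idx);                              // 1. publish "chunk may have set bits"
  return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, NULL); // 2. then set the bits themselves
}

static bool bitmap_clear_n_sketch(mi_bitmap_t* bitmap, size_t idx, size_t n) {
  const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
  const size_t cidx      = idx % MI_BITMAP_CHUNK_BITS;
  const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx);   // snapshot before clearing
  size_t already_clear   = 0;
  const bool all_were_set = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear);
  if (already_clear < n &&                                                // we actually cleared something
      epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) &&             // no concurrent set advanced the epoch
      mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) {
    mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch);               // best effort; may fail harmlessly
  }
  return all_were_set;
}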
-bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < MI_BFIELD_BITS); + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); @@ -1020,185 +1092,121 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) bitmap try_find_and_clear -------------------------------------------------------------------------------- */ +typedef bool (mi_bitmap_find_fun_t)(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx); -#define mi_bitmap_forall_set_chunks(bitmap,tseq,name_epoch,name_chunk_idx) \ - { uint32_t _bit_idx; \ - uint32_t _start = (uint32_t)(tseq % MI_EPOCHSET_BITS); \ - size_t name_epoch; \ - uint32_t _any_set = mi_bitmap_anyset(bitmap,&name_epoch); \ - _any_set = mi_rotr32(_any_set, _start); \ - while (mi_bsf32(_any_set,&_bit_idx)) { \ - size_t name_chunk_idx = (_bit_idx + _start) % MI_BFIELD_BITS; +static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx, mi_bitmap_find_fun_t* find_fun) +{ + if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; + + // start chunk index -- todo: can depend on the tseq to decrease contention between threads + MI_UNUSED(tseq); + const size_t chunk_start = 0; + const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; + const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; -#define mi_bitmap_forall_set_chunks_end() \ - _start += _bit_idx+1; /* so chunk_idx calculation stays valid */ \ - _any_set >>= _bit_idx; /* skip scanned bits (and avoid UB with (_bit_idx+1)) */ \ - _any_set >>= 1; \ - } \ - } - -// Find a set bit in a bitmap and atomically unset it. Returns true on success, -// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. -// The low `MI_BFIELD_BITS` of start are used to set the start point of the search -// (to reduce thread contention). -mi_decl_nodiscard static bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_set_chunks(bitmap, tseq, epoch, chunk_idx) + // for each chunkmap entry `i` + for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS); - return true; - } - else { - // we may find that all are unset only on a second iteration but that is ok as - // _any_set is a conservative approximation. 
- if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + size_t i = (_i + chunk_map_start); + if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; // adjust for the start position + + const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; + mi_epoch_t epoch; + mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &epoch); + if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } // rotate right for the start position (on the first iteration) + + uint32_t cmap_idx; // one bit set of each chunk that may have bits set + size_t cmap_idx_shift = 0; // shift through the cmap + while (mi_bsf32(cmap, &cmap_idx)) { // find least bit that is set + // adjust for the start position + if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } + // set the chunk idx + const size_t chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; + + // try to find and clear N bits in that chunk + if (chunk_idx < mi_bitmap_chunk_count(bitmap)) { // we can have less chunks than in the chunkmap.. + if ((*find_fun)(bitmap, n, chunk_idx, epoch, pidx)) { + return true; + } } + + // skip to the next bit + cmap_idx_shift += cmap_idx+1; + cmap >>= cmap_idx; // skip scanned bits (and avoid UB for `cmap_idx+1`) + cmap >>= 1; } } - mi_bitmap_forall_set_chunks_end(); + return false; } - -// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard static bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,tseq, epoch, chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8); - mi_assert_internal((*pidx % 8) == 0); - return true; - } - else { - // we may find that all are unset only on a second iteration but that is ok as - // _any_set is a conservative approximation. - if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); - } - } +static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + return true; + } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + return false; } - mi_bitmap_forall_set_chunks_end(); - return false; } // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -mi_decl_nodiscard static bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { - // TODO: allow spanning across chunk boundaries? 
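A hypothetical caller-side sketch of the new search entry point; ensure_committed and the surrounding control flow are illustrative only, not from this patch (the arena allocation path in xarena.c is the actual consumer).

// claim `slice_count` contiguous free slices from an arena, rolling back on failure
size_t slice_index;
if (mi_bitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) {
  // bits [slice_index, slice_index+slice_count) are now exclusively owned by this thread
  if (!ensure_committed(arena, slice_index, slice_count)) {              // hypothetical helper
    mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL);  // hand the slices back
  }
}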
- if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; - mi_bitmap_forall_set_chunks(bitmap,tseq,epoch,chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n); - return true; - } - else { - // we may find that all are unset only on a second iteration but that is ok as - // _any_set is a conservative approximation. - if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); - } - } - } - mi_bitmap_forall_set_chunks_end(); - return false; -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - if (n == 1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); - if (n == 8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); - return mi_bitmap_try_find_and_clearN_(bitmap, n, tseq, pidx); -} - - -/* -------------------------------------------------------------------------------- - pairmap epochset --------------------------------------------------------------------------------- */ - -static void mi_pairmap_anyset_set(mi_pairmap_t* pairmap, size_t chunk_idx) { - mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); - mi_epochset_set(&pairmap->any_set, chunk_idx); -} - -static bool mi_pairmap_anyset_try_clear(mi_pairmap_t* pairmap, size_t chunk_idx, size_t epoch) { - mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); - return mi_epochset_try_clear(&pairmap->any_set, chunk_idx, epoch); -} - -static uint32_t mi_pairmap_anyset(mi_pairmap_t* pairmap, size_t* epoch) { - uint32_t bset; - mi_epochset_split(mi_atomic_load_relaxed(&pairmap->any_set), &bset, epoch); - return bset; -} - -static size_t mi_pairmap_epoch(mi_pairmap_t* pairmap) { - size_t epoch; - uint32_t bset; - mi_epochset_split(mi_atomic_load_relaxed(&pairmap->any_set), &bset, &epoch); - return epoch; +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) +{ + return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); } /* -------------------------------------------------------------------------------- pairmap -------------------------------------------------------------------------------- */ -// initialize a pairmap to all clear; avoid a mem_zero if `already_zero` is true -void mi_pairmap_init(mi_pairmap_t* pairmap, bool already_zero) { - if (!already_zero) { - _mi_memzero_aligned(pairmap, sizeof(*pairmap)); +void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2) { + mi_assert_internal(mi_bitmap_chunk_count(bm1)==mi_bitmap_chunk_count(bm2)); + pairmap->bitmap1 = bm1; + pairmap->bitmap2 = bm2; +} + +static void mi_pairmap_from_pair_idx(mi_pairmap_t* pairmap, size_t pair_idx, mi_bitmap_t** bitmap, size_t* pidx) { + const size_t idx = 2*pair_idx; + const size_t maxbits = mi_bitmap_max_bits(pairmap->bitmap1); + mi_assert_internal(pair_idx < maxbits); + if (idx < maxbits) { + *bitmap = pairmap->bitmap1; + *pidx = idx; + } + else { + *bitmap = pairmap->bitmap2; + *pidx = idx - maxbits; } } -/* -------------------------------------------------------------------------------- - pairmap set/clear unconditionally --------------------------------------------------------------------------------- */ - -// is a pairmap entry clear? 
-bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { - const size_t idx = 2*pair_idx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - return mi_bitmap_chunk_is_clear2(&pairmap->chunks[chunk_idx], cidx); -} - -// A reader can set from busy, or a new abandoned page can set from clear bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx) { - const size_t idx = 2*pair_idx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - // first set the anyset since it is a conservative approximation(increases epoch) - mi_pairmap_anyset_set(pairmap, chunk_idx/2); - return mi_bitmap_chunk_set2(&pairmap->chunks[chunk_idx], cidx, NULL); + mi_bitmap_t* bitmap; + size_t idx; + mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); + return mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); } -// A busy reader can clear unconditionally -void mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx) { - const size_t idx = 2*pair_idx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - const size_t epoch = mi_pairmap_epoch(pairmap); - bool both_already_clear = false; - mi_bitmap_chunk_clear2(&pairmap->chunks[chunk_idx], cidx, &both_already_clear); - mi_assert_internal(!both_already_clear); // in our use cases this should not happen - if (!both_already_clear && epoch == mi_pairmap_epoch(pairmap)) { - const size_t chunk_idx1 = 2*(chunk_idx/2); // round down to even - mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx1]; - mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx1 + 1]; - if (mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk2)) { - mi_pairmap_anyset_try_clear(pairmap, chunk_idx1/2, epoch); - } - } +bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx) { + mi_bitmap_t* bitmap; + size_t idx; + mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); + return mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); +} + +bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { + mi_bitmap_t* bitmap; + size_t idx; + mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); + return mi_bitmap_is_xset2(MI_BIT_CLEAR, bitmap, idx); } @@ -1207,8 +1215,8 @@ void mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx) { pairmap clear while not busy -------------------------------------------------------------------------------- */ -static inline bool mi_bfield_atomic_clear_while_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). +static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). 
mi_assert_internal(idx < MI_BFIELD_BITS-1); const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); @@ -1221,41 +1229,44 @@ static inline bool mi_bfield_atomic_clear_while_not_busy(_Atomic(mi_bfield_t)*b, while ((old&mask)==mask_busy) { // busy wait mi_atomic_yield(); old = mi_atomic_load_acquire(b); - } + } } bnew = (old & ~mask); // clear } while (!mi_atomic_cas_weak_acq_rel(b, &old, bnew)); mi_assert_internal((old&mask) != mask_busy); // we should never clear a busy page mi_assert_internal((old&mask) == mask); // in our case: we should only go from set to clear (when reclaiming an abandoned page from a free) - return true; + return ((old&mask) == mask); } -static void mi_pairmap_chunk_clear_while_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_clear2_while_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - mi_bfield_atomic_clear_while_not_busy(&chunk->bfields[i], idx); + return mi_bfield_atomic_clear2_while_not_busy(&chunk->bfields[i], idx); } -// Used for a page about to be freed to clear itself from the abandoned map; it has to wait -// for all readers to finish reading the page -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { - const size_t idx = 2*pair_idx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); +static bool mi_bitmap_clear2_while_not_busy(mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal((idx%2)==0); + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - const size_t epoch = mi_pairmap_epoch(pairmap); - mi_pairmap_chunk_clear_while_not_busy(&pairmap->chunks[chunk_idx], cidx); - if (epoch == mi_pairmap_epoch(pairmap)) { - const size_t chunk_idx1 = 2*(chunk_idx/2); // round down to even - mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx1]; - mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx1 + 1]; - if (mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk2)) { - mi_pairmap_anyset_try_clear(pairmap, chunk_idx1/2, epoch); - } - } + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); + bool cleared = mi_bitmap_chunk_clear2_while_not_busy(&bitmap->chunks[chunk_idx], cidx); + if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; } +void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { + mi_bitmap_t* bitmap; + size_t idx; + mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); + mi_bitmap_clear2_while_not_busy(bitmap, idx); +} + + /* -------------------------------------------------------------------------------- pairmap try and set busy @@ -1263,7 +1274,7 @@ void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { // Atomically go from set to busy, or return false otherwise and leave the bit field as-is. static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). 
mi_assert_internal(idx < MI_BFIELD_BITS-1); const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); @@ -1277,11 +1288,11 @@ static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t return true; } -static inline bool mi_pairmap_chunk_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { +static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { size_t idx; if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i], &idx)) { // find least 1-bit, it may be set or busy - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); @@ -1292,41 +1303,36 @@ static inline bool mi_pairmap_chunk_find_and_set_busy(mi_bitmap_chunk_t* chunk, return false; } +static bool mi_bitmap_try_find_and_set_busy_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { + MI_UNUSED(epoch); + mi_assert_internal(n==2); + size_t cidx; + if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + return true; + } + else { + return false; + } +} + +static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { + return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_set_busy_at); +} + // Used to find an abandoned page, and transition from set to busy. 
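The pair-state protocol that these busy helpers implement can be summarized in a standalone sketch (states: 00 = clear, 01 = busy, 11 = set/abandoned). The word type and the spin policy are assumptions, and the real code additionally maintains the chunk and chunkmap bookkeeping shown earlier.

#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>

typedef uint64_t bfield_t;
enum { PAIR_CLEAR = 0, PAIR_BUSY = 1, PAIR_SET = 3 };

// A reclaiming reader CASes the pair from "set" (11) to "busy" (01), or fails.
static bool pair_try_set_busy(_Atomic(bfield_t)* b, unsigned idx) {   // idx is the even bit of the pair
  const bfield_t mask      = (bfield_t)3 << idx;
  const bfield_t mask_busy = (bfield_t)PAIR_BUSY << idx;
  bfield_t old = atomic_load_explicit(b, memory_order_relaxed);
  do {
    if ((old & mask) != mask) return false;                  // no longer in the "set" state
  } while (!atomic_compare_exchange_weak(b, &old, (old & ~mask) | mask_busy));
  return true;
}

// The owner that frees the page must wait while a reader holds the pair busy,
// then clears it to 00.
static void pair_clear_while_not_busy(_Atomic(bfield_t)* b, unsigned idx) {
  const bfield_t mask      = (bfield_t)3 << idx;
  const bfield_t mask_busy = (bfield_t)PAIR_BUSY << idx;
  bfield_t old = atomic_load_explicit(b, memory_order_acquire);
  for (;;) {
    while ((old & mask) == mask_busy) {                      // spin (the real code yields here)
      old = atomic_load_explicit(b, memory_order_acquire);
    }
    if (atomic_compare_exchange_weak(b, &old, old & ~mask)) return;
  }
}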
mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) { - uint32_t bit_idx; - uint32_t start = (uint32_t)(tseq % MI_EPOCHSET_BITS); - size_t epoch; - uint32_t any_set = mi_pairmap_anyset(pairmap,&epoch); - any_set = mi_rotr32(any_set, start); - while (mi_bsf32(any_set,&bit_idx)) { \ - size_t chunk_idx = 2*((bit_idx + start) % MI_BFIELD_BITS); - { - // look at chunk_idx and chunck_idx+1 - mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx]; - mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx+1]; - size_t cidx; - if (mi_pairmap_chunk_find_and_set_busy(chunk1, &cidx)) { - const size_t idx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - mi_assert_internal((idx%2)==0); - *pidx = idx/2; - return true; - } - else if (mi_pairmap_chunk_find_and_set_busy(chunk2, &cidx)) { - const size_t idx = ((chunk_idx+1) * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - mi_assert_internal((idx%2)==0); - *pidx = idx/2; - return true; - } - else if (epoch == mi_pairmap_epoch(pairmap) && mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk1)) { - mi_pairmap_anyset_try_clear(pairmap, chunk_idx/2, epoch); - } + size_t idx = 0; + if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, &idx)) { + if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, &idx)) { + return false; + } + else { + idx += mi_bitmap_max_bits(pairmap->bitmap1); } - start += bit_idx+1; /* so chunk_idx computation stays valid */ - any_set >>= bit_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ - any_set >>= 1; } - return false; + mi_assert_internal((idx%2)==0); + *pidx = idx/2; + return true; } diff --git a/src/bitmap.h b/src/bitmap.h index 948bd1e3..9b931c95 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -34,30 +34,56 @@ typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; } mi_bitmap_chunk_t; -// for now 32 (note: with ABA instructions we can make this 64) -#define MI_EPOCHSET_BITS (32) -#define MI_BITMAP_CHUNK_COUNT MI_EPOCHSET_BITS -typedef uint64_t mi_epochset_t; +// for now 32-bit epoch + 32-bit bit-set (note: with ABA instructions we can double this) +typedef uint64_t mi_chunkmap_t; +typedef uint32_t mi_epoch_t; +typedef uint32_t mi_cmap_t; + +#define MI_CHUNKMAP_BITS (32) // 1 chunkmap tracks 32 chunks + +#define MI_BITMAP_MAX_CHUNKMAPS (16) +#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BITMAP_MAX_CHUNKMAPS * MI_CHUNKMAP_BITS) +#define MI_BITMAP_MIN_CHUNK_COUNT (1 * MI_CHUNKMAP_BITS) // 1 GiB arena + +#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16 GiB arena +#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 1 GiB arena typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - mi_bitmap_chunk_t chunks[MI_BITMAP_CHUNK_COUNT]; - _Atomic(mi_epochset_t) any_set; + _Atomic(size_t) chunk_map_count; + _Atomic(size_t) chunk_count; + _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; + // padding + mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; -// 16k bits on 64bit, 8k bits on 32bit -// with 64KiB slices, this can address a 1GiB arena -#define MI_BITMAP_MAX_BITS (MI_BITMAP_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) +static inline size_t mi_bitmap_chunk_map_count(const mi_bitmap_t* bitmap) { + return mi_atomic_load_relaxed(&bitmap->chunk_map_count); 
+} + +static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) { + return mi_atomic_load_relaxed(&bitmap->chunk_count); +} + +static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { + return (mi_bitmap_chunk_count(bitmap) * MI_BITMAP_CHUNK_BITS); +} + + /* -------------------------------------------------------------------------------- Atomic bitmap -------------------------------------------------------------------------------- */ -typedef bool mi_bit_t; +typedef bool mi_xset_t; #define MI_BIT_SET (true) #define MI_BIT_CLEAR (false) + +size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count); + // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); +// returns the size of the bitmap. +size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); // Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); @@ -65,7 +91,7 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! // If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); +bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, already_set); @@ -77,7 +103,7 @@ static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // Is a sequence of n bits already all set/cleared? -bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); @@ -88,9 +114,29 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n } +// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_SET, bitmap, idx, n); +} + +static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); +} + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); + + + + // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) // and false otherwise leaving the bitmask as is. 
-//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); // //static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { // return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); @@ -103,7 +149,7 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. -//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); // //static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { // return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); @@ -113,48 +159,28 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n // return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); //} -// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_try_xsetN(MI_BIT_SET, bitmap, idx, n); -} - -static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); -} - -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); - /* -------------------------------------------------------------------------------- Atomic bitmap for a pair of bits -------------------------------------------------------------------------------- */ -typedef mi_bfield_t mi_pair_t; - #define MI_PAIR_CLEAR (0) #define MI_PAIR_BUSY (1) #define MI_PAIR_UNUSED (2) // should never occur #define MI_PAIR_SET (3) -typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_pairmap_s { - mi_bitmap_chunk_t chunks[2*MI_BITMAP_CHUNK_COUNT]; - _Atomic(mi_epochset_t) any_set; +typedef struct mi_pairmap_s { + mi_bitmap_t* bitmap1; + mi_bitmap_t* bitmap2; } mi_pairmap_t; -#define MI_PAIRMAP_MAX_PAIRS (MI_BITMAP_MAX_BITS) // 16k pairs on 64bit, 8k pairs on 32bit -#define MI_PAIRMAP_MAX_BITS (2*MI_PAIRMAP_MAX_PAIRS) + // initialize a pairmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_pairmap_init(mi_pairmap_t* pairmap, bool already_zero); +void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2); bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx); +bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx); -void mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx); diff --git a/src/page-map.c b/src/page-map.c index 0e99a890..35a22d8d 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -22,7 +22,8 @@ static bool mi_page_map_init(void) { // 64 KiB for 4 GiB address space (on 32-bit) const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); - mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_MIN_BIT_COUNT); + mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); mi_page_map_all_committed = false; // _mi_os_has_overcommit(); // commit on-access on Linux systems? 
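For a sense of the commit granularity chosen above, a small worked example; the numbers are assumptions for illustration (48-bit virtual address space, 64 KiB arena slices, one page-map byte per slice), all of which vary by platform and configuration:

#include <stdio.h>
#include <stddef.h>

int main(void) {
  const size_t vbits           = 48;                                  // assumed virtual address bits
  const size_t slice_shift     = 16;                                  // 64 KiB slices
  const size_t page_map_size   = (size_t)1 << (vbits - slice_shift);  // 2^32 one-byte entries (needs 64-bit size_t)
  const size_t commit_bits     = 32 * 512;                            // MI_BITMAP_MIN_BIT_COUNT
  const size_t entries_per_bit = (page_map_size + commit_bits - 1) / commit_bits;
  printf("%zu page-map entries, %zu entries per commit bit\n", page_map_size, entries_per_bit);
  return 0;
}

With these assumptions each commit bit covers 262144 page-map entries.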
_mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); diff --git a/test/test-stress.c b/test/test-stress.c index 9e53e920..e49fde00 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -41,7 +41,7 @@ static int THREADS = 8; static int SCALE = 10; static int ITER = 10; #elif 0 -static int THREADS = 4; +static int THREADS = 1; static int SCALE = 100; static int ITER = 10; #define ALLOW_LARGE false From bc7fe399b159e548c7b42cb4fbd287e0d12bffd0 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 23:35:33 -0800 Subject: [PATCH 029/264] large bitmaps working; lock on arena_reserve --- include/mimalloc/internal.h | 1 + src/arena.c | 42 ++++++++++++++++++++++++++----------- src/bitmap.c | 4 ++-- src/init.c | 1 + test/test-stress.c | 5 +++++ 5 files changed, 39 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 34dbab07..c92375c5 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -132,6 +132,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t ma // arena.c mi_arena_id_t _mi_arena_id_none(void); +void _mi_arena_init(void); void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); diff --git a/src/arena.c b/src/arena.c index f8b6fca1..bc885ef8 100644 --- a/src/arena.c +++ b/src/arena.c @@ -53,13 +53,19 @@ typedef struct mi_arena_s { // followed by the bitmaps (whose size depends on the arena size) } mi_arena_t; -#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) +#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) + // 160 arenas is enough for ~2 TiB memory // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 +static mi_lock_t mi_arena_reserve_lock; + +void _mi_arena_init(void) { + mi_lock_init(&mi_arena_reserve_lock); +} /* ----------------------------------------------------------- Arena id's @@ -275,9 +281,9 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); - if (arena_count >= 8 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + if (arena_count >= 1 && arena_count <= 128) { + // scale up the arena sizes exponentially every 8 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -285,7 +291,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = 8; // hope that fits minimal bitmaps? + const size_t min_reserve = 8 * MI_ARENA_SLICE_SIZE; // hope that fits minimal bitmaps? 
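A worked example of the exponential scaling a few lines above; the 1 GiB base reserve is assumed here purely for illustration (the real base comes from the arena reserve option):

#include <stdio.h>
#include <stddef.h>

int main(void) {
  const size_t GiB  = (size_t)1 << 30;
  const size_t base = 1 * GiB;                                      // assumed arena reserve
  for (size_t arena_count = 0; arena_count <= 40; arena_count += 8) {
    size_t shift = arena_count/8; if (shift > 16) { shift = 16; }   // _mi_clamp(arena_count/8, 0, 16)
    size_t reserve = base << shift;                                 // doubles every 8 arenas
    if (reserve > 16 * GiB) { reserve = 16 * GiB; }                 // capped by max_reserve (16 GiB)
    printf("arena_count=%2zu -> reserve %2zu GiB\n", arena_count, reserve / GiB);
  }
  return 0;
}

This prints 1, 2, 4, 8, 16, 16 GiB for arena counts 0, 8, 16, 24, 32, 40.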
const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; @@ -380,21 +386,32 @@ static mi_decl_noinline void* mi_arena_try_alloc( { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); - + void* p; +again: // try to find free slices in the arena's - void* p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); + p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { + // did we need a specific arena? + if (req_arena_id != _mi_arena_id_none()) return NULL; + + // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) + if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id)) { + bool ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + mi_lock_release(&mi_arena_reserve_lock); + if (ok) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; } } + else { + // if we are racing with another thread wait until the new arena is reserved (todo: a better yield?) + mi_atomic_yield(); + goto again; + } return NULL; } @@ -524,7 +541,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; - if (!_mi_option_get_fast(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? 
!os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { @@ -982,6 +999,7 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } + _mi_stat_counter_increase(&stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); @@ -1145,7 +1163,7 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ _mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; - for (int i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { + for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { char buf[MI_BITMAP_CHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { diff --git a/src/bitmap.c b/src/bitmap.c index 4156cfd1..2dbba52d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -985,7 +985,7 @@ static bool mi_bitmap_try_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, } } -bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); @@ -1304,7 +1304,7 @@ static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chun } static bool mi_bitmap_try_find_and_set_busy_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - MI_UNUSED(epoch); + MI_UNUSED(epoch); MI_UNUSED(n); mi_assert_internal(n==2); size_t cidx; if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { diff --git a/src/init.c b/src/init.c index 99a5ea39..3dcb68e3 100644 --- a/src/init.c +++ b/src/init.c @@ -619,6 +619,7 @@ void mi_process_init(void) mi_attr_noexcept { mi_detect_cpu_features(); _mi_os_init(); + _mi_arena_init(); mi_heap_main_init(); #if MI_DEBUG _mi_verbose_message("debug level : %d\n", MI_DEBUG); diff --git a/test/test-stress.c b/test/test-stress.c index e49fde00..904b1acc 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -50,6 +50,11 @@ static int THREADS = 32; static int SCALE = 50; static int ITER = 50; #define ALLOW_LARGE false +#elif 0 +static int THREADS = 64; +static int SCALE = 400; +static int ITER = 10; +#define ALLOW_LARGE true #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor From 45f7fb559ace2ba1c463d0ca48dbeff62e46d117 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 4 Dec 2024 00:14:56 -0800 Subject: [PATCH 030/264] small fixes --- include/mimalloc/internal.h | 11 +++++++++-- src/bitmap.h | 3 ++- test/test-stress.c | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c92375c5..cb689877 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -487,7 +487,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { } static inline mi_page_t* _mi_ptr_page(const void* p) { - #if 1 // MI_DEBUG + #if MI_DEBUG return _mi_checked_ptr_page(p); #else return _mi_ptr_page_ex(p,NULL); @@ -638,6 +638,13 @@ static inline bool mi_page_is_mostly_used(const mi_page_t* page) { return 
(page->reserved - page->used <= frac); } +// is less than 1/n'th of a page free? +static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { + if (page==NULL) return true; + uint16_t frac = page->reserved / n; + return (page->reserved - page->used <= frac); +} + static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) return (mi_atomic_load_acquire(&page->xthread_id) <= 1); @@ -692,7 +699,7 @@ static inline bool mi_page_try_claim_ownership(mi_page_t* page) { static inline void _mi_page_unown(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); mi_thread_free_t tf_new; mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); do { diff --git a/src/bitmap.h b/src/bitmap.h index 9b931c95..d73ee98a 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -51,8 +51,9 @@ typedef uint32_t mi_cmap_t; typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { _Atomic(size_t) chunk_map_count; _Atomic(size_t) chunk_count; + size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; - // padding + mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; diff --git a/test/test-stress.c b/test/test-stress.c index 904b1acc..0b1b6c8d 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -45,7 +45,7 @@ static int THREADS = 1; static int SCALE = 100; static int ITER = 10; #define ALLOW_LARGE false -#elif 0 +#elif 1 static int THREADS = 32; static int SCALE = 50; static int ITER = 50; From afe90891529058605f9bd910953304322e291aeb Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 4 Dec 2024 19:15:55 -0800 Subject: [PATCH 031/264] more documentation; better pairmap find_and_set_to_busy, busy flag is now 0x10 --- src/arena.c | 88 ++++++++--------- src/bitmap.c | 212 ++++++++++++++++++++++++++--------------- src/bitmap.h | 125 +++++++++++++++--------- src/free.c | 262 ++++++--------------------------------------------- src/page.c | 2 +- 5 files changed, 296 insertions(+), 393 deletions(-) diff --git a/src/arena.c b/src/arena.c index bc885ef8..19815616 100644 --- a/src/arena.c +++ b/src/arena.c @@ -199,7 +199,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( void* p = mi_arena_slice_start(arena, slice_index); *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; - + // set the dirty bits if (arena->memid.initially_zero) { // size_t dirty_count = 0; @@ -239,7 +239,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_zero = false; } } - #endif + #endif size_t already_committed_count = 0; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); if (already_committed_count < slice_count) { @@ -247,7 +247,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); } } - } + } } else { // no need to commit, but check if already fully committed @@ -282,8 +282,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 1 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries - 
const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + // scale up the arena sizes exponentially every 8 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -399,7 +399,7 @@ again: if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; bool ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); - mi_lock_release(&mi_arena_reserve_lock); + mi_lock_release(&mi_arena_reserve_lock); if (ok) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); @@ -476,6 +476,19 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ +static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2) { + mi_arena_t* arena = (mi_arena_t*)arg1; + mi_subproc_t* subproc = (mi_subproc_t*)arg2; + + // found an abandoned page of the right size + // it is set busy for now so we can read safely even with concurrent mi_free reclaiming + // try to claim ownership atomically + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + if (subproc != page->subproc) return false; + if (!mi_page_try_claim_ownership(page)) return false; + return true; +} + static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { MI_UNUSED(slice_count); @@ -493,38 +506,29 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl { size_t slice_index; mi_pairmap_t* const pairmap = &arena->pages_abandoned[bin]; - while (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index)) { // todo: don't restart from scratch if we fail for some entry? - // found an abandoned page of the right size - // it is set busy for now so we can read safely even with concurrent mi_free reclaiming - // try to claim ownership atomically - mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); - if (!mi_page_try_claim_ownership(page)) { - // a concurrent free already grabbed the page. 
- // Restore the abandoned_map to make it available again (unblocking busy waiters) - mi_pairmap_set(pairmap, slice_index); - } - else { - // we got ownership, clear the abandoned entry (unblocking busy waiters) - mi_pairmap_clear(pairmap, slice_index); - mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); - _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_ptr_page(page)==page); - mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); - mi_assert_internal(mi_page_block_size(page) == block_size); - mi_assert_internal(mi_page_is_abandoned(page)); - mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(!mi_page_is_full(page)); - return page; - } - } + if (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { + // found an abandoned page of the right size + // and claimed ownership. + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); + _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); + + _mi_page_free_collect(page, false); // update `used` count + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(!mi_page_is_full(page)); + return page; + } } mi_forall_arenas_end(); return NULL; @@ -565,8 +569,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // claimed free slices: initialize the page partly - if (!memid.initially_zero) { - _mi_memzero_aligned(page, sizeof(*page)); + if (!memid.initially_zero) { + _mi_memzero_aligned(page, sizeof(*page)); } #if MI_DEBUG > 1 else { @@ -779,7 +783,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done - mi_pairmap_clear_while_not_busy(&arena->pages_abandoned[bin], slice_index); + mi_pairmap_clear_once_not_busy(&arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } @@ -999,7 +1003,7 @@ static bool mi_arena_add(mi_arena_t* 
arena, mi_arena_id_t* arena_id, mi_stats_t* mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } - + _mi_stat_counter_increase(&stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); @@ -1049,7 +1053,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB); return false; - } + } size_t bitmap_base; const size_t info_slices = mi_arena_info_slices_needed(slice_count, &bitmap_base); if (slice_count < info_slices+1) { diff --git a/src/bitmap.c b/src/bitmap.c index 2dbba52d..1aa0a822 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -995,13 +995,13 @@ mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, s // Set/clear a sequence of 2 bits that were on an even `idx` in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal((idx%2)==0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - + if (set) { // first set the chunkmap since it is a conservative approximation (increases epoch) mi_bitmap_chunkmap_set(bitmap, chunk_idx); @@ -1066,7 +1066,7 @@ static inline bool mi_bitmap_is_xset2(mi_xset_t set, mi_bitmap_t* bitmap, size_t mi_assert_internal(idx + 2 <= mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); return mi_bitmap_chunk_is_xset2(set, &bitmap->chunks[chunk_idx], cidx); } @@ -1091,13 +1091,13 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* -------------------------------------------------------------------------------- bitmap try_find_and_clear -------------------------------------------------------------------------------- */ - +/* typedef bool (mi_bitmap_find_fun_t)(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx); static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx, mi_bitmap_find_fun_t* find_fun) { if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; - + // start chunk index -- todo: can depend on the tseq to decrease contention between threads MI_UNUSED(tseq); const size_t chunk_start = 0; @@ -1105,7 +1105,7 @@ static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; // for each chunkmap entry `i` - for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) + for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { size_t i = (_i + chunk_map_start); if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; // adjust for the start position @@ -1122,50 +1122,106 @@ static inline bool 
mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } // set the chunk idx const size_t chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; - + // try to find and clear N bits in that chunk if (chunk_idx < mi_bitmap_chunk_count(bitmap)) { // we can have less chunks than in the chunkmap.. if ((*find_fun)(bitmap, n, chunk_idx, epoch, pidx)) { return true; } } - + // skip to the next bit cmap_idx_shift += cmap_idx+1; cmap >>= cmap_idx; // skip scanned bits (and avoid UB for `cmap_idx+1`) cmap >>= 1; } } - + return false; } +*/ -static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); - return true; - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return false; - } -} +#define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ + { \ + /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ + MI_UNUSED(tseq); \ + const size_t chunk_start = 0; \ + const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; \ + const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; \ + /* for each chunkmap entry `i` */ \ + for (size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { \ + size_t i = (_i + chunk_map_start); \ + if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; /* adjust for the start position */ \ + \ + const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; \ + mi_epoch_t name_epoch; \ + mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &name_epoch); \ + if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } /* rotate right for the start position (on the first iteration) */ \ + \ + uint32_t cmap_idx; /* one bit set of each chunk that may have bits set */ \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ + while (mi_bsf32(cmap, &cmap_idx)) { /* find least bit that is set */ \ + /* adjust for the start position again */ \ + if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } \ + /* set the chunk idx */ \ + const size_t name_chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; \ + /* try to find and clear N bits in that chunk */ \ + if (name_chunk_idx < mi_bitmap_chunk_count(bitmap)) { /* we can have less chunks than in the chunkmap.. 
*/ + +#define mi_bitmap_forall_chunks_end() \ + } \ + /* skip to the next bit */ \ + cmap_idx_shift += cmap_idx+1; \ + cmap >>= cmap_idx; /* skip scanned bits (and avoid UB for `cmap_idx+1`) */ \ + cmap >>= 1; \ + } \ + }} + +//static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { +// size_t cidx; +// if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { +// *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; +// mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); +// return true; +// } +// else { +// // we may find that all are cleared only on a second iteration but that is ok as +// // the chunkmap is a conservative approximation. +// if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { +// mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); +// } +// return false; +// } +//} // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); + // return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); + mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + return true; + } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + // continue + } + } + mi_bitmap_forall_chunks_end(); + return false; } /* -------------------------------------------------------------------------------- - pairmap + pairmap -------------------------------------------------------------------------------- */ void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2) { @@ -1215,10 +1271,10 @@ bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { pairmap clear while not busy -------------------------------------------------------------------------------- */ -static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). +static inline bool mi_bfield_atomic_clear2_once_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). 
mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); + const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); mi_bfield_t bnew; mi_bfield_t old = mi_atomic_load_relaxed(b); @@ -1238,32 +1294,32 @@ static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b return ((old&mask) == mask); } -static inline bool mi_bitmap_chunk_clear2_while_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_clear2_once_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_clear2_while_not_busy(&chunk->bfields[i], idx); + return mi_bfield_atomic_clear2_once_not_busy(&chunk->bfields[i], idx); } -static bool mi_bitmap_clear2_while_not_busy(mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_clear2_once_not_busy(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal((idx%2)==0); mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool cleared = mi_bitmap_chunk_clear2_while_not_busy(&bitmap->chunks[chunk_idx], cidx); + bool cleared = mi_bitmap_chunk_clear2_once_not_busy(&bitmap->chunks[chunk_idx], cidx); if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } + } return cleared; } -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { +void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { mi_bitmap_t* bitmap; size_t idx; mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - mi_bitmap_clear2_while_not_busy(bitmap, idx); + mi_bitmap_clear2_once_not_busy(bitmap, idx); } @@ -1274,9 +1330,9 @@ void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { // Atomically go from set to busy, or return false otherwise and leave the bit field as-is. static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); + const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); mi_bfield_t old; mi_bfield_t bnew; @@ -1290,49 +1346,57 @@ static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i], &idx)) { // find least 1-bit, it may be set or busy - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). 
- if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); - return true; + while (true) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]) & MI_BFIELD_LO_BIT2; // only keep MI_PAIR_SET bits + size_t idx; + if (!mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + break; // not found: continue with the next field + } + else { + mi_assert_internal((idx%2)==0); + if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); + return true; + } + // else: try this word once again } } } return false; } -static bool mi_bitmap_try_find_and_set_busy_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - MI_UNUSED(epoch); MI_UNUSED(n); - mi_assert_internal(n==2); - size_t cidx; - if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); - return true; - } - else { - return false; - } -} -static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_set_busy_at); +static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t idx_offset, size_t* ppair_idx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2) +{ + mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + { + MI_UNUSED(epoch); MI_UNUSED(n); + mi_assert_internal(n==2); + size_t cidx; + if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { + const size_t idx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal((idx%2)==0); + const size_t pair_idx = (idx + idx_offset)/2; + if (claim(pair_idx, arg1, arg2)) { // while busy, the claim function can read from the page + mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); // claimed, clear the entry + *ppair_idx = pair_idx; + return true; + } + else { + mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); // not claimed, reset the entry + // and continue + } + } + } + mi_bitmap_forall_chunks_end(); + return false; } // Used to find an abandoned page, and transition from set to busy. 
-mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) {
-  size_t idx = 0;
-  if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, &idx)) {
-    if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, &idx)) {
-      return false;
-    }
-    else {
-      idx += mi_bitmap_max_bits(pairmap->bitmap1);
-    }
-  }
-  mi_assert_internal((idx%2)==0);
-  *pidx = idx/2;
-  return true;
+mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pair_idx,
+                                                        mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2 ) {
+  if (mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, 0, pair_idx, claim, arg1, arg2)) return true;
+  return mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, mi_bitmap_max_bits(pairmap->bitmap1), pair_idx, claim, arg1, arg2);
 }
diff --git a/src/bitmap.h b/src/bitmap.h
index d73ee98a..ca62735b 100644
--- a/src/bitmap.h
+++ b/src/bitmap.h
@@ -13,9 +13,47 @@ Concurrent bitmap that can set/reset sequences of bits atomically
 #define MI_BITMAP_H
 
 /* --------------------------------------------------------------------------------
-  Definitions
--------------------------------------------------------------------------------- */
+  Atomic bitmaps:
+
+  `mi_bfield_t`: a single machine word that can efficiently be bit counted (usually `size_t`);
+    each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB).
+    We need 16K bits to represent a 1GiB arena.
+
+  `mi_bitmap_chunk_t`: a chunk of bfields, for a total of MI_BITMAP_CHUNK_BITS (= 512) bits;
+    allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number
+    of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB).
+    These chunks are cache-aligned and we can use AVX2/AVX512/SVE/SVE2/etc. instructions
+    to scan for bits (perhaps) more efficiently.
+
+  `mi_chunkmap_t`: for each chunk we track if it has (potentially) any bit set.
+    The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set.
+    This is used to avoid scanning every chunk (and is thus strictly an optimization).
+    It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out
+    to have no bits set.
+
+    When we (potentially) set a bit in a chunk, we first update the chunkmap.
+    However, when we clear a bit in a chunk, and the chunk is indeed all clear, we
+    cannot safely clear the bit corresponding to the chunk in the chunkmap since it
+    may race with another thread setting a bit in the same chunk (and we may clear the
+    bit even though a bit is set in the chunk which is not allowed).
+
+    To fix this, the chunkmap contains 32 bits for the chunks, and a 32-bit "epoch"
+    counter that is increased every time a bit is set.  We only clear a bit if the epoch
+    stayed the same over our clear operation (so we know no other thread in the
+    meantime set a bit in any of the chunks corresponding to the chunkmap).
+    Since increasing the epoch and setting a bit must be atomic, we use only half-word
+    bits (32); we could use 128-bit atomics if needed since modern hardware supports these.
+
+  `mi_bitmap_t`: a bitmap with N chunks. A bitmap always has MI_BITMAP_MAX_CHUNKMAPS (=16)
+    chunkmap entries and can support arenas from a few chunks up to 16 chunkmaps = 16 * 32 chunks = 16 GiB.
+    The `chunk_count` can be anything from 1 to the max supported by the chunkmaps, but
+    each chunk is always complete (512 bits, so 512 * 64KiB = 32MiB memory areas).
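A minimal standalone model of this epoch scheme; the type and function names and the exact 32/32 packing shown here are illustrative only (the patch itself goes through `mi_chunkmap_split`/`mi_chunkmap_join` and the `mi_bitmap_chunkmap_*` helpers):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t chunkmap_t;   // assumed packing: high 32 bits = epoch, low 32 bits = per-chunk bits

static inline uint32_t   cm_epoch(chunkmap_t cm) { return (uint32_t)(cm >> 32); }
static inline uint32_t   cm_bits (chunkmap_t cm) { return (uint32_t)cm; }
static inline chunkmap_t cm_join (uint32_t epoch, uint32_t bits) { return ((chunkmap_t)epoch << 32) | bits; }

// Setting a chunk bit (chunk_idx < 32) always bumps the epoch as well.
static void cm_set(_Atomic(chunkmap_t)* cm, unsigned chunk_idx) {
  chunkmap_t old = atomic_load(cm), desired;
  do {
    desired = cm_join(cm_epoch(old) + 1, cm_bits(old) | (UINT32_C(1) << chunk_idx));
  } while (!atomic_compare_exchange_weak(cm, &old, desired));
}

// Clear a chunk bit only if the epoch is still the one observed when the chunk was
// found all-clear; otherwise another thread may have set a bit in the meantime.
static bool cm_try_clear(_Atomic(chunkmap_t)* cm, unsigned chunk_idx, uint32_t observed_epoch) {
  chunkmap_t old = atomic_load(cm);
  if (cm_epoch(old) != observed_epoch) return false;
  const chunkmap_t desired = cm_join(observed_epoch, cm_bits(old) & ~(UINT32_C(1) << chunk_idx));
  return atomic_compare_exchange_strong(cm, &old, desired);
}

Since every set bumps the epoch, a clear can only succeed when no set happened anywhere in the 32 tracked chunks between observing the chunk as empty and clearing its chunkmap bit.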
+ + For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count + and pop-count (but we think it can be adapted work reasonably well on older hardware too) +--------------------------------------------------------------------------------------------- */ + +// A word-size bit field. typedef size_t mi_bfield_t; #define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) @@ -29,16 +67,18 @@ typedef size_t mi_bfield_t; #define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) #define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) -// 512 bits on 64_bit +// A bitmap chunk contains 512 bits of bfields on 64_bit (256 on 32-bit) typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; } mi_bitmap_chunk_t; + // for now 32-bit epoch + 32-bit bit-set (note: with ABA instructions we can double this) typedef uint64_t mi_chunkmap_t; typedef uint32_t mi_epoch_t; typedef uint32_t mi_cmap_t; + #define MI_CHUNKMAP_BITS (32) // 1 chunkmap tracks 32 chunks #define MI_BITMAP_MAX_CHUNKMAPS (16) @@ -48,15 +88,18 @@ typedef uint32_t mi_cmap_t; #define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16 GiB arena #define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 1 GiB arena + +// An atomic bitmap typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_map_count; - _Atomic(size_t) chunk_count; + _Atomic(size_t) chunk_map_count; // valid chunk_map's + _Atomic(size_t) chunk_count; // total count of chunks size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; - + mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; + static inline size_t mi_bitmap_chunk_map_count(const mi_bitmap_t* bitmap) { return mi_atomic_load_relaxed(&bitmap->chunk_map_count); } @@ -72,17 +115,19 @@ static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { /* -------------------------------------------------------------------------------- - Atomic bitmap + Atomic bitmap operations -------------------------------------------------------------------------------- */ +// Many operations are generic over setting or clearing the bit sequence: we use `mi_xset_t` for this (true if setting, false if clearing) typedef bool mi_xset_t; #define MI_BIT_SET (true) #define MI_BIT_CLEAR (false) +// Required size of a bitmap to represent `bit_count` bits. size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count); -// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true // returns the size of the bitmap. size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); @@ -134,56 +179,46 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t - -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -// and false otherwise leaving the bitmask as is. 
-//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); -// -//static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); -//} -// -//static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); -//} - - -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); -// -//static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); -//} -// -//static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); -//} - - /* -------------------------------------------------------------------------------- - Atomic bitmap for a pair of bits + Atomic bitmap for a pair of bits. + + The valid pairs are CLEAR (0), SET (3), or BUSY (2). + + These bit pairs are used in the abandoned pages maps: when set, the entry has + an available page. When we scan for an available abandoned page and find an entry SET, + we first set it to BUSY, and try to claim the page atomically (since it can race + with a concurrent `mi_free` which also tries to claim the page). However, unlike `mi_free`, + we cannot be sure that a concurrent `mi_free` also didn't free (and decommit) the page + just when we got the entry. Therefore, a page can only be freed after `mi_arena_unabandon` + which (busy) waits until the BUSY flag is cleared to ensure all readers are done. + (and pair-bit operations must therefore be release_acquire). 
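That per-pair protocol can be sketched standalone on a single 64-bit bfield; the names below are illustrative and this is not the mimalloc code (the real implementation works across the two bitmaps of a pairmap and their chunks). One routine finds a SET pair, flips it to BUSY, runs a claim callback, and then clears or restores it; the other is the busy-wait used when un-abandoning:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAIR_BUSY  UINT64_C(2)   // 0b10
#define PAIR_SET   UINT64_C(3)   // 0b11

typedef bool (claim_fun_t)(size_t pair_idx, void* arg);

// Find the first SET pair, flip it to BUSY, run `claim` while it is BUSY, then
// CLEAR it on success or restore it to SET on failure.
static bool word_try_claim_pair(_Atomic(uint64_t)* w, claim_fun_t* claim, void* arg, size_t* pair_idx) {
  for (;;) {
    // the low bit of a pair is 1 only for SET pairs (BUSY is 0b10), so this mask
    // leaves exactly one 1-bit per SET pair
    const uint64_t sets = atomic_load(w) & UINT64_C(0x5555555555555555);
    if (sets == 0) return false;                             // no SET pair in this word
    const unsigned idx  = (unsigned)__builtin_ctzll(sets);   // even bit index of that pair (gcc/clang builtin)
    const uint64_t mask = PAIR_SET << idx;
    uint64_t old = atomic_load(w);
    if ((old & mask) != mask) continue;                      // the pair changed underneath us: rescan
    if (!atomic_compare_exchange_weak(w, &old, (old & ~mask) | (PAIR_BUSY << idx))) continue;
    *pair_idx = idx/2;
    if (claim(*pair_idx, arg)) {                             // e.g. check the subproc and take page ownership
      atomic_fetch_and(w, ~mask);                            // claimed: BUSY -> CLEAR
      return true;
    }
    atomic_fetch_or(w, mask);                                // not claimed: BUSY -> SET again
    return false;  // (a fuller version would keep scanning the remaining pairs instead)
  }
}

// Wait until the pair is no longer BUSY, then clear it (the un-abandon side).
static void word_clear_pair_once_not_busy(_Atomic(uint64_t)* w, size_t pair_idx) {
  const unsigned idx       = (unsigned)(2*pair_idx);
  const uint64_t mask      = PAIR_SET  << idx;               // both bits of the pair
  const uint64_t busy_mask = PAIR_BUSY << idx;
  uint64_t old = atomic_load(w);
  for (;;) {
    if ((old & mask) == busy_mask) { old = atomic_load(w); } // a reader still holds it BUSY: spin (yield in real code)
    else if (atomic_compare_exchange_weak(w, &old, old & ~mask)) { return; }
  }
}

Because the claim callback runs while the pair is BUSY, and the clearing side waits for BUSY to end, a concurrent reader is never left looking at a page that has already been freed, which is the guarantee described above.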
-------------------------------------------------------------------------------- */ #define MI_PAIR_CLEAR (0) -#define MI_PAIR_BUSY (1) -#define MI_PAIR_UNUSED (2) // should never occur +#define MI_PAIR_UNUSED (1) // should never occur +#define MI_PAIR_BUSY (2) #define MI_PAIR_SET (3) +// 0b....0101010101010101 +#define MI_BFIELD_LO_BIT2 ((MI_BFIELD_LO_BIT8 << 6)|(MI_BFIELD_LO_BIT8 << 4)|(MI_BFIELD_LO_BIT8 << 2)|MI_BFIELD_LO_BIT8) + +// A pairmap manipulates pairs of bits (and consists of 2 bitmaps) typedef struct mi_pairmap_s { mi_bitmap_t* bitmap1; - mi_bitmap_t* bitmap2; + mi_bitmap_t* bitmap2; } mi_pairmap_t; - - -// initialize a pairmap to all unset; avoid a mem_zero if `already_zero` is true +// initialize a pairmap to all clear; avoid a mem_zero if `already_zero` is true void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2); bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx); -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); -mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx); +void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); + +typedef bool (mi_bitmap_claim_while_busy_fun_t)(size_t pair_index, void* arg1, void* arg2); +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1 ,void* arg2 + ); -#endif // MI_XBITMAP_H +#endif // MI_BITMAP_H diff --git a/src/free.c b/src/free.c index 70ef5d8a..1e07dbd2 100644 --- a/src/free.c +++ b/src/free.c @@ -148,15 +148,44 @@ void mi_free(void* p) mi_attr_noexcept } - // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ +static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page); + +// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) +{ + // adjust stats (after padding check and potentially recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page, block)); + + // _mi_padding_shrink(page, block, sizeof(mi_block_t)); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + size_t dbgsize = mi_usable_size(block); + if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } + _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); + #endif + + // push atomically on the page thread free list + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_block_set_next(page, block, mi_tf_block(tf_old)); + tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); + + // and atomically reclaim the page if it was abandoned + bool reclaimed = !mi_tf_is_owned(tf_old); + if (reclaimed) { + mi_free_try_reclaim_mt(page); + } +} + static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); -#if 1 // we own the page now.. 
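A standalone sketch of the push-and-maybe-claim step in `mi_free_block_mt` above, assuming the thread-free word packs the list head together with an 'owned' flag in one machine word; the type and helper names here are illustrative, not mimalloc's own:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct block_s { struct block_s* next; } block_t;
typedef uintptr_t tfree_t;   // assumed packing: block pointer | owned flag in bit 0

static inline block_t* tf_block(tfree_t tf)    { return (block_t*)(tf & ~(uintptr_t)1); }
static inline bool     tf_is_owned(tfree_t tf) { return (tf & 1) != 0; }
static inline tfree_t  tf_make(block_t* b, bool owned) { return (uintptr_t)b | (owned ? 1 : 0); }

// Push `block` on the page-local thread-free list; returns true if this push also
// took ownership of a previously abandoned page (the old owned flag was 0).
static bool thread_free_push(_Atomic(tfree_t)* xthread_free, block_t* block) {
  tfree_t old = atomic_load_explicit(xthread_free, memory_order_relaxed);
  tfree_t desired;
  do {
    block->next = tf_block(old);              // link to the current head
    desired = tf_make(block, true);           // always leave the list marked as owned
  } while (!atomic_compare_exchange_weak_explicit(xthread_free, &old, desired,
                                                  memory_order_acq_rel, memory_order_relaxed));
  return !tf_is_owned(old);                   // this thread observed the unowned-to-owned transition
}

Because the push always leaves the owned flag set, exactly one freeing thread observes the transition from unowned to owned and becomes responsible for reclaiming or re-abandoning the page.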
// safe to collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count @@ -209,237 +238,8 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { // not reclaimed or free'd, unown again _mi_page_unown(page); - -#else - if (!mi_page_is_abandoned_mapped(page)) { - // singleton or OS allocated - if (mi_page_is_singleton(page)) { - // free singleton pages - #if MI_DEBUG>1 - _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_page_all_free(page)); - #endif - // we can free the page directly - _mi_arena_page_free(page); - return; - } - else { - const bool was_full = mi_page_is_full(page); - _mi_page_free_collect(page,false); // update used - if (mi_page_all_free(page)) { - // no need to unabandon as it is unmapped - _mi_arena_page_free(page); - return; - } - else if (was_full && _mi_arena_page_reabandon_full(page)) { - return; - } - else if (!mi_page_is_mostly_used(page) && _mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - if (heap != (mi_heap_t*)&_mi_heap_empty) { // we did not already terminate our thread (can this happen? - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - // make it part of our heap (no need to unabandon as is unmapped) - _mi_heap_page_reclaim(tagheap, page); - return; - } - } - } - } - } - else { - // don't reclaim pages that can be found for fresh page allocations - } - - // not reclaimed or free'd, unown again - _mi_page_unown(page); -#endif } -/* -// we own the page now.. -// safe to collect the thread atomic free list -_mi_page_free_collect(page, false); // update `used` count -if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } - -if (mi_page_all_free(page)) { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); // this must be before free'ing - // we can free the page directly - _mi_arena_page_free(page); - return; -} -else if (!mi_page_is_mostly_used(page)) { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - - if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && // only if reclaim on free is allowed - (heap != (mi_heap_t*)&_mi_heap_empty)) // we did not already terminate our thread (can this happen? - { - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - // make it part of our heap - _mi_heap_page_reclaim(tagheap, page); - return; - } - } -} - -// we cannot reclaim this page.. leave it abandoned -// todo: should re-abandon or otherwise a partly used page could never be re-used if the -// objects in it are not freed explicitly. -_mi_page_unown(page); -*/ - - -// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page, block)); - - // _mi_padding_shrink(page, block, sizeof(mi_block_t)); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - size_t dbgsize = mi_usable_size(block); - if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } - _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); - #endif - - // push atomically on the page thread free list - mi_thread_free_t tf_new; - mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); - do { - mi_block_set_next(page, block, mi_tf_block(tf_old)); - tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); - - // and atomically reclaim the page if it was abandoned - bool reclaimed = !mi_tf_is_owned(tf_old); - if (reclaimed) { - mi_free_try_reclaim_mt(page); - } -} - - /* - // Try to put the block on either the page-local thread free list, - // or the heap delayed free list (if this is the first non-local free in that page) - mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - // If this was the first non-local free, we need to push it on the heap delayed free list instead - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see 
`mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - } -} - -// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // first see if the page was abandoned and if we can reclaim it into our thread - if (mi_page_is_abandoned(page)) { - if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || - mi_page_is_singleton(page)) { // only one block, and we are free-ing it - if (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) - { - // the page is abandoned, try to reclaim it into our heap - if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); - // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - else { - if (mi_page_is_abandoned(page)) { - // mi_assert(false); - } - // mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages - } - } - } - } - - - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); - - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); - - // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - _mi_padding_shrink(page, block, sizeof(mi_block_t)); - - if (mi_page_is_huge(page)) { - mi_assert_internal(mi_page_is_singleton(page)); - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). 
- _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively - } - else { - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - #endif - } - - // and finally free the actual block by pushing it on the owning heap - // thread_delayed free list (or heap delayed free list) - mi_free_block_delayed_mt(page,block); -} -*/ // ------------------------------------------------------ // Usable size diff --git a/src/page.c b/src/page.c index e5e3f972..faef2f48 100644 --- a/src/page.c +++ b/src/page.c @@ -44,7 +44,7 @@ static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { mi_assert_internal(_mi_ptr_page(page) == page); size_t count = 0; while (head != NULL) { - mi_assert_internal((uint8_t*)head - (uint8_t*)page > MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); + mi_assert_internal((uint8_t*)head - (uint8_t*)page > (ptrdiff_t)MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); count++; head = mi_block_next(page, head); } From bc67be4d79ff03ef824efcecd0aae1066b068b16 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 4 Dec 2024 21:40:57 -0800 Subject: [PATCH 032/264] small adjustments --- include/mimalloc/bits.h | 13 ++++++ src/arena.c | 58 +----------------------- src/bitmap.c | 98 +++++++++++------------------------------ src/bitmap.h | 2 +- src/init.c | 2 +- test/test-stress.c | 4 +- 6 files changed, 43 insertions(+), 134 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index f3bbe3bc..e1951cf7 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -314,6 +314,19 @@ static inline bool mi_bsr(size_t x, size_t* idx) { #endif } +// Bit scan reverse: find the most significant bit that is set +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bsr32(uint32_t x, uint32_t* idx) { +#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + return (_BitScanReverse(&i, x) ? (*idx = i, true) : false); +#else + const size_t r = mi_clz((size_t)x); + *idx = (~r & (MI_SIZE_BITS - 1)) - (MI_SIZE_SIZE - sizeof(uint32_t)); + return (x!=0); +#endif +} /* -------------------------------------------------------------------------------- diff --git a/src/arena.c b/src/arena.c index 19815616..79a52c4d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -335,7 +335,7 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are size_t _start; \ if (req_arena_id == _mi_arena_id_none()) { \ _max_arena = mi_atomic_load_relaxed(&mi_arena_count); \ - _start = (_max_arena <= 1 ? 0 : (tseq / MI_THREADS_PER_ARENA) % _max_arena); \ + _start = (_max_arena <= 2 ? 
0 : (tseq % (_max_arena-1))); \ } \ else { \ _max_arena = 1; \ @@ -795,62 +795,6 @@ void _mi_arena_page_unabandon(mi_page_t* page) { _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); } -/* -bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { - if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_is_abandoned(page)); } - mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_ptr_page(page)==page); - // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned (anymore) - - // note: we can access the page even it is in the meantime reclaimed by another thread since - // we only call this when on free (and thus there is still an object alive in the page) - mi_memid_t memid = page->memid; - if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's - if (mi_atomic_load_acquire(&page->xheap) != (uintptr_t)heap->tld->subproc) return false; - - if mi_likely(memid.memkind == MI_MEM_ARENA) { - size_t slice_index; - mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); - //if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess - - // don't reclaim more from a `free` call than half the current segments - // this is to prevent a pure free-ing thread to start owning too many segments - // (but not for out-of-arena segments as that is the main way to be reclaimed for those) - // if (segment->memid.memkind == MI_MEM_ARENA && heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) { - // return false; - // } - const size_t bin = _mi_bin(page->block_size); - if (mi_bitmap_try_clear(&arena->slices_abandoned[bin], slice_index)) { - // we got it atomically - _mi_page_reclaim(heap, page); - mi_assert_internal(!mi_page_is_abandoned(page)); - return true; - } - else { - if (mi_page_is_abandoned(page)) { - // mi_assert(false); - } - } - } - else { - // A page in OS or external memory - if (mi_atomic_load_acquire(&page->xheap) != (uintptr_t)heap->tld->subproc) return false; - - // we use the thread_id to atomically grab ownership - mi_threadid_t abandoned_thread_id = 0; - if (mi_atomic_cas_strong_acq_rel(&page->xthread_id, &abandoned_thread_id, heap->thread_id)) { - // we got it atomically - _mi_page_reclaim(heap, page); - mi_assert_internal(!mi_page_is_abandoned(page)); - return true; - } - } - - - return false; -} -*/ - void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { MI_UNUSED(heap); // TODO: implement this diff --git a/src/bitmap.c b/src/bitmap.c index 1aa0a822..d5578cfb 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -768,7 +768,7 @@ static inline bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { static void mi_chunkmap_split(mi_chunkmap_t es, mi_cmap_t* cmap, mi_epoch_t* epoch) { *cmap = (mi_cmap_t)es; - *epoch = (mi_epoch_t)(es >> 32); + if (epoch!=NULL) { *epoch = (mi_epoch_t)(es >> 32); } } static mi_chunkmap_t mi_chunkmap_join(mi_cmap_t cmap, mi_epoch_t epoch) { @@ -1091,80 +1091,50 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* -------------------------------------------------------------------------------- bitmap try_find_and_clear -------------------------------------------------------------------------------- */ -/* -typedef bool (mi_bitmap_find_fun_t)(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx); - -static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx, 
mi_bitmap_find_fun_t* find_fun) -{ - if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; - - // start chunk index -- todo: can depend on the tseq to decrease contention between threads - MI_UNUSED(tseq); - const size_t chunk_start = 0; - const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; - const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; - - // for each chunkmap entry `i` - for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) - { - size_t i = (_i + chunk_map_start); - if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; // adjust for the start position - - const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; - mi_epoch_t epoch; - mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &epoch); - if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } // rotate right for the start position (on the first iteration) - - uint32_t cmap_idx; // one bit set of each chunk that may have bits set - size_t cmap_idx_shift = 0; // shift through the cmap - while (mi_bsf32(cmap, &cmap_idx)) { // find least bit that is set - // adjust for the start position - if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } - // set the chunk idx - const size_t chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; - - // try to find and clear N bits in that chunk - if (chunk_idx < mi_bitmap_chunk_count(bitmap)) { // we can have less chunks than in the chunkmap.. - if ((*find_fun)(bitmap, n, chunk_idx, epoch, pidx)) { - return true; - } - } - - // skip to the next bit - cmap_idx_shift += cmap_idx+1; - cmap >>= cmap_idx; // skip scanned bits (and avoid UB for `cmap_idx+1`) - cmap >>= 1; +static inline size_t mi_bitmap_find_hi_chunk(mi_bitmap_t* bitmap) { + size_t hi_chunk_map_idx = 0; + mi_cmap_t hi_cmap = 0; + for (size_t i = 1; i < mi_bitmap_chunk_map_count(bitmap); i++) { + mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, i, NULL); + if (cmap != 0) { + hi_chunk_map_idx = i; + hi_cmap = cmap; } } - - return false; + uint32_t cmap_idx; + if (mi_bsr32(hi_cmap, &cmap_idx)) { + const size_t hi = (hi_chunk_map_idx * MI_CHUNKMAP_BITS) + cmap_idx; + mi_assert_internal(hi < mi_bitmap_chunk_count(bitmap)); + return hi; + } + else { + return 0; + } } -*/ #define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_start = 0; \ + const size_t chunk_start = 0; /* tseq % (1 + mi_bitmap_find_hi_chunk(bitmap)); */ \ const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; \ - const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; \ + const uint32_t chunk_map_start_idx = (uint32_t)(chunk_start % MI_CHUNKMAP_BITS); \ /* for each chunkmap entry `i` */ \ for (size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { \ size_t i = (_i + chunk_map_start); \ - if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; /* adjust for the start position */ \ + if (i >= bitmap->chunk_map_count) { i -= bitmap->chunk_map_count; } /* adjust for the start position */ \ \ const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; \ mi_epoch_t name_epoch; \ mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &name_epoch); \ - if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } /* rotate right for the start position (on the first iteration) */ \ + uint32_t cmap_idx_shift = 0; /* shift through the cmap */ \ + if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); cmap_idx_shift = chunk_map_start_idx; } /* rotate 
right for the start position (on the first iteration) */ \ \ uint32_t cmap_idx; /* one bit set of each chunk that may have bits set */ \ - size_t cmap_idx_shift = 0; /* shift through the cmap */ \ while (mi_bsf32(cmap, &cmap_idx)) { /* find least bit that is set */ \ - /* adjust for the start position again */ \ - if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } \ /* set the chunk idx */ \ - const size_t name_chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; \ + size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_CHUNKMAP_BITS); \ + if (name_chunk_idx >= mi_bitmap_chunk_count(bitmap)) { name_chunk_idx -= mi_bitmap_chunk_count(bitmap); } \ /* try to find and clear N bits in that chunk */ \ if (name_chunk_idx < mi_bitmap_chunk_count(bitmap)) { /* we can have less chunks than in the chunkmap.. */ @@ -1177,28 +1147,10 @@ static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq } \ }} -//static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { -// size_t cidx; -// if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { -// *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; -// mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); -// return true; -// } -// else { -// // we may find that all are cleared only on a second iteration but that is ok as -// // the chunkmap is a conservative approximation. -// if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { -// mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); -// } -// return false; -// } -//} - // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - // return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) { size_t cidx; diff --git a/src/bitmap.h b/src/bitmap.h index ca62735b..78ee5380 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -91,7 +91,7 @@ typedef uint32_t mi_cmap_t; // An atomic bitmap typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_map_count; // valid chunk_map's + _Atomic(size_t) chunk_map_count; // valid chunk_maps entries _Atomic(size_t) chunk_count; // total count of chunks size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; diff --git a/src/init.c b/src/init.c index 3dcb68e3..353b0ce4 100644 --- a/src/init.c +++ b/src/init.c @@ -400,7 +400,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } diff --git a/test/test-stress.c b/test/test-stress.c index 0b1b6c8d..61891269 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -343,9 +343,9 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - // mi_debug_show_arenas(true, true, false); + mi_debug_show_arenas(true, true, false); mi_collect(true); - mi_debug_show_arenas(true,true,false); + // mi_debug_show_arenas(true,true,false); #endif // mi_stats_print(NULL); #else From 0616ee151e75329b425dd999104c2c84e2e1c3ae Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 5 Dec 2024 11:29:25 -0800 Subject: [PATCH 033/264] change to full_page_retain --- include/mimalloc.h | 2 +- include/mimalloc/types.h | 4 ++-- src/heap.c | 2 +- src/options.c | 4 ++-- src/page.c | 25 +++++++++++++++---------- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index b87e8db2..ba426488 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -379,7 +379,7 @@ typedef enum mi_option_e { mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) mi_option_target_segments_per_thread, // experimental (=0) - mi_option_eager_abandon, // eagerly abandon pages from the heap if suitable (to reduce memory footprint in multi-threaded code) + mi_option_full_page_retain, // retain N full pages per size class (=4, lower it to reduce memory footprint in multi-thread applications) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 3d83e27a..348e2aa9 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -305,7 +305,7 @@ typedef struct mi_page_s { #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - + mi_heap_t* heap; // heap this threads belong to. struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` @@ -417,7 +417,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. 
mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages - bool eager_abandon; // `true` if this heap can abandon pages to reduce memory footprint + bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types #if MI_GUARDED size_t guarded_size_min; // minimal size for guarded objects diff --git a/src/heap.c b/src/heap.c index 96342907..833af278 100644 --- a/src/heap.c +++ b/src/heap.c @@ -206,7 +206,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->no_reclaim = noreclaim; - heap->eager_abandon = (!noreclaim && mi_option_is_enabled(mi_option_eager_abandon)); + heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->tag = tag; if (heap == tld->heap_backing) { _mi_random_init(&heap->random); diff --git a/src/options.c b/src/options.c index 1b326cc3..a6d42c58 100644 --- a/src/options.c +++ b/src/options.c @@ -143,7 +143,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. #if defined(MI_VISIT_ABANDONED) @@ -158,7 +158,7 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. 
- { 1, UNINIT, MI_OPTION(eager_abandon) }, + { 2, UNINIT, MI_OPTION(full_page_retain) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index faef2f48..9b35a4db 100644 --- a/src/page.c +++ b/src/page.c @@ -212,7 +212,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(page!=NULL); // collect the thread free list - _mi_page_thread_free_collect(page); + _mi_page_thread_free_collect(page); // and the local free list if (page->local_free != NULL) { @@ -264,7 +264,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { */ // called from `mi_free` on a reclaim, and fresh_alloc if we get an abandoned page -void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page) +void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); @@ -381,7 +381,7 @@ void _mi_page_unfull(mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(mi_page_is_in_full(page)); - mi_assert_internal(!mi_page_heap(page)->eager_abandon); + mi_assert_internal(!mi_page_heap(page)->allow_page_abandon); if (!mi_page_is_in_full(page)) return; mi_heap_t* heap = mi_page_heap(page); @@ -398,7 +398,7 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_internal(!mi_page_is_in_full(page)); mi_heap_t* heap = mi_page_heap(page); - if (heap->eager_abandon) { + if (heap->allow_page_abandon) { // abandon full pages _mi_page_abandon(page, pq); } @@ -761,9 +761,10 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { // search for a best next page to use for at most N pages (often cut short if immediate blocks are available) #define MI_MAX_CANDIDATE_SEARCH (8) +#define MI_MAX_FULL_PAGES_PER_QUEUE (4) // Find a page with free blocks of `page->block_size`. -static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) +static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) { // search through the pages in "next fit" order #if MI_STAT @@ -772,6 +773,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p #if MI_MAX_CANDIDATE_SEARCH > 1 size_t candidate_count = 0; // we reset this on the first candidate to limit the search #endif + size_t full_page_count = 0; mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -797,8 +799,11 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p // if the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often. 
if (!immediate_available && !mi_page_is_expandable(page)) { - mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); - mi_page_to_full(page, pq); + full_page_count++; + if (full_page_count > MI_MAX_FULL_PAGES_PER_QUEUE) { + mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); + mi_page_to_full(page, pq); + } } else { // the page has free space, make it a candidate @@ -807,8 +812,8 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p page_candidate = page; candidate_count = 0; } - else if (mi_page_all_free(page_candidate)) { - _mi_page_free(page_candidate, pq); + else if (mi_page_all_free(page_candidate)) { + _mi_page_free(page_candidate, pq); page_candidate = page; } else if (page->used >= page_candidate->used) { // && !mi_page_is_mostly_used(page)) { @@ -1000,7 +1005,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_heap_is_initialized(heap)); // call potential deferred free routines - _mi_deferred_free(heap, false); + // _mi_deferred_free(heap, false); // free delayed frees from other threads (but skip contended ones) // _mi_heap_delayed_free_partial(heap); From 7443ee317e189937118c93157eb7b70125ad60a3 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 5 Dec 2024 17:00:23 -0800 Subject: [PATCH 034/264] tune free-ing and abandoning --- include/mimalloc.h | 7 +- include/mimalloc/internal.h | 5 +- include/mimalloc/types.h | 6 +- src/bitmap.c | 8 +-- src/free.c | 124 +++++++++++++++++++++--------------- src/heap.c | 14 ++++ src/init.c | 3 +- src/options.c | 5 +- src/page-map.c | 15 +++-- src/page.c | 31 ++++----- 10 files changed, 125 insertions(+), 93 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index ba426488..907ffadb 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 188 // major + 2 digits minor +#define MI_MALLOC_VERSION 300 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes @@ -369,7 +369,6 @@ typedef enum mi_option_e { mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`) mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10) mi_option_purge_extend_delay, - mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
(only on windows) mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0) @@ -379,7 +378,9 @@ typedef enum mi_option_e { mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) mi_option_target_segments_per_thread, // experimental (=0) - mi_option_full_page_retain, // retain N full pages per size class (=4, lower it to reduce memory footprint in multi-thread applications) + mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) + mi_option_full_page_retain, // retain N full pages per size class (=2) + mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index cb689877..3a8b272e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -27,6 +27,8 @@ terms of the MIT license. A copy of the license can be found in the file #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning +#pragma warning(disable:28159) // don't use GetVersion +#pragma warning(disable:4996) // don't use GetVersion #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_align(a) __declspec(align(a)) @@ -169,6 +171,7 @@ void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current); */ // "page-map.c" +bool _mi_page_map_init(void); void _mi_page_map_register(mi_page_t* page); void _mi_page_map_unregister(mi_page_t* page); @@ -638,7 +641,7 @@ static inline bool mi_page_is_mostly_used(const mi_page_t* page) { return (page->reserved - page->used <= frac); } -// is less than 1/n'th of a page free? +// is more than (n-1)/n'th of a page in use? static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { if (page==NULL) return true; uint16_t frac = page->reserved / n; diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 348e2aa9..d4c37c37 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -12,10 +12,8 @@ terms of the MIT license. A copy of the license can be found in the file // This file contains the main type definitions for mimalloc: // mi_heap_t : all data for a thread-local heap, contains // lists of all managed heap pages. -// mi_segment_t : a larger chunk of memory (32GiB) from where pages -// are allocated. // mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from -// where objects are allocated. +// where objects of a single size are allocated. // Note: we write "OS page" for OS memory pages while // using plain "page" for mimalloc pages (`mi_page_t`). // -------------------------------------------------------------------------- @@ -417,7 +415,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. 
mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages - bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint + bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types #if MI_GUARDED size_t guarded_size_min; // minimal size for guarded objects diff --git a/src/bitmap.c b/src/bitmap.c index d5578cfb..f25c91ac 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -861,10 +861,10 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) if (!already_zero) { _mi_memzero_aligned(bitmap, size); } - bitmap->chunk_map_count = _mi_divide_up(chunk_count, MI_CHUNKMAP_BITS); - mi_assert_internal(bitmap->chunk_map_count <= MI_BITMAP_MAX_CHUNKMAPS); - bitmap->chunk_count = chunk_count; - mi_assert_internal(bitmap->chunk_map_count <= MI_BITMAP_MAX_CHUNK_COUNT); + mi_atomic_store_release(&bitmap->chunk_map_count, _mi_divide_up(chunk_count, MI_CHUNKMAP_BITS)); + mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_map_count) <= MI_BITMAP_MAX_CHUNKMAPS); + mi_atomic_store_release(&bitmap->chunk_count, chunk_count); + mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); return size; } diff --git a/src/free.c b/src/free.c index 1e07dbd2..0ff4bf60 100644 --- a/src/free.c +++ b/src/free.c @@ -23,9 +23,6 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); // Free // ------------------------------------------------------ -// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_block_t* block); - // regular free of a (thread local) block pointer // fast path written carefully to prevent spilling on the stack static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full) @@ -50,6 +47,40 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } } +// Forward declaration for multi-threaded collect +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page); + +// Free a block multi-threaded +static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) +{ + // adjust stats (after padding check and potentially recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page, block)); + + // _mi_padding_shrink(page, block, sizeof(mi_block_t)); +#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + size_t dbgsize = mi_usable_size(block); + if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } + _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); +#endif + + // push atomically on the page thread free list + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_block_set_next(page, block, mi_tf_block(tf_old)); + tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); + + // and atomically try to collect the page if it was abandoned + const bool is_owned_now = !mi_tf_is_owned(tf_old); + if (is_owned_now) { + mi_assert_internal(mi_page_is_abandoned(page)); + mi_free_try_collect_mt(page); + } +} 
+ + // Adjust a block that was allocated aligned, to the actual start of the block in the page. // note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the // `page_start` and `block_size` fields; however these are constant and the page won't be @@ -81,6 +112,7 @@ static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, vo } #endif + // free a local pointer (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p); @@ -101,6 +133,7 @@ void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) else mi_free_generic_mt(page,p); } + // Get the segment data belonging to a pointer // This is just a single `and` in release mode but does further checks in debug mode // (and secure mode) to see if this was a valid pointer. @@ -142,8 +175,16 @@ void mi_free(void* p) mi_attr_noexcept } } else { - // not thread-local; use generic path - mi_free_generic_mt(page, p); + // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) + if mi_likely(page->flags.full_aligned == 0) { + // blocks are aligned (and not a full page) + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_mt(page,block); + } + else { + // page is full or contains (inner) aligned blocks; use generic multi-thread path + mi_free_generic_mt(page, p); + } } } @@ -152,40 +193,11 @@ void mi_free(void* p) mi_attr_noexcept // Multi-threaded Free (`_mt`) // ------------------------------------------------------ -static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page); -// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page, block)); - - // _mi_padding_shrink(page, block, sizeof(mi_block_t)); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - size_t dbgsize = mi_usable_size(block); - if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } - _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); - #endif - - // push atomically on the page thread free list - mi_thread_free_t tf_new; - mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); - do { - mi_block_set_next(page, block, mi_tf_block(tf_old)); - tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); - - // and atomically reclaim the page if it was abandoned - bool reclaimed = !mi_tf_is_owned(tf_old); - if (reclaimed) { - mi_free_try_reclaim_mt(page); - } -} - -static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); + // we own the page now.. // safe to collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count @@ -202,16 +214,10 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { _mi_arena_page_free(page); return; } - // 2. 
if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - else if (!mi_page_is_mostly_used(page) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page - !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && - _mi_arena_page_try_reabandon_to_mapped(page)) - { - return; - } - // 3. if the page is not too full, we can try to reclaim it for ourselves - else if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 && - !mi_page_is_mostly_used(page)) + + // 2. if the page is not too full, we can try to reclaim it for ourselves + if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && + !mi_page_is_used_at_frac(page,8)) { // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again @@ -222,20 +228,32 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { if (heap != (mi_heap_t*)&_mi_heap_empty) // we did not already terminate our thread (can this happen? { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types + if ((tagheap != NULL) && // don't reclaim across heap object types + (!tagheap->no_reclaim) && // we are allowed to reclaim abandoned pages (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); - _mi_heap_page_reclaim(tagheap, page); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - return; + { + if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); + _mi_heap_page_reclaim(tagheap, page); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); + return; + } } } } + // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations + if (!mi_page_is_used_at_frac(page, 4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && + _mi_arena_page_try_reabandon_to_mapped(page)) + { + return; + } + + // not reclaimed or free'd, unown again _mi_page_unown(page); } diff --git a/src/heap.c b/src/heap.c index 833af278..2ff40930 100644 --- a/src/heap.c +++ b/src/heap.c @@ -208,6 +208,20 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool heap->no_reclaim = noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->tag = tag; + + #if defined(WIN32) && (MI_ARCH_X64 || MI_ARCH_X86) + // disallow reclaim for threads running in the windows threadpool + const DWORD winVersion = GetVersion(); + const DWORD winMajorVersion = (DWORD)(LOBYTE(LOWORD(winVersion))); + if (winMajorVersion >= 6) { + _TEB* const teb = NtCurrentTeb(); + void* const poolData = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 
0x0F90 : 0x1778))); + if (poolData != NULL) { + heap->no_reclaim = true; + } + } + #endif + if (heap == tld->heap_backing) { _mi_random_init(&heap->random); } diff --git a/src/init.c b/src/init.c index 353b0ce4..64b31e1b 100644 --- a/src/init.c +++ b/src/init.c @@ -400,7 +400,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } @@ -619,6 +619,7 @@ void mi_process_init(void) mi_attr_noexcept { mi_detect_cpu_features(); _mi_os_init(); + _mi_page_map_init(); _mi_arena_init(); mi_heap_main_init(); #if MI_DEBUG diff --git a/src/options.c b/src/options.c index a6d42c58..f2e9297f 100644 --- a/src/options.c +++ b/src/options.c @@ -143,7 +143,6 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. #if defined(MI_VISIT_ABANDONED) @@ -158,7 +157,9 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. + { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(full_page_retain) }, + { 4, UNINIT, MI_OPTION(max_page_candidates) }, }; static void mi_option_init(mi_option_desc_t* desc); @@ -189,7 +190,7 @@ void _mi_options_init(void) { } } _mi_verbose_message("guarded build: %s\n", mi_option_get(mi_option_guarded_sample_rate) != 0 ? 
"enabled" : "disabled"); - #endif + #endif } long _mi_option_get_fast(mi_option_t option) { diff --git a/src/page-map.c b/src/page-map.c index 35a22d8d..25693064 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -13,9 +13,9 @@ mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; -static mi_bitmap_t mi_page_map_commit; +static mi_bitmap_t mi_page_map_commit = { 1, MI_BITMAP_MIN_CHUNK_COUNT }; -static bool mi_page_map_init(void) { +bool _mi_page_map_init(void) { size_t vbits = _mi_os_virtual_address_bits(); if (vbits >= 48) vbits = 47; // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) @@ -23,7 +23,7 @@ static bool mi_page_map_init(void) { const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_MIN_BIT_COUNT); - mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); + // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); mi_page_map_all_committed = false; // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); @@ -57,11 +57,15 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { bool is_zero; uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); const size_t size = mi_page_map_entries_per_commit_bit; - _mi_os_commit(start, size, &is_zero, NULL); + _mi_os_commit(start, size, &is_zero, NULL); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, i, 1, NULL); } } + #if MI_DEBUG > 0 + _mi_page_map[idx] = 0; + _mi_page_map[idx+slice_count-1] = 0; + #endif } } @@ -78,8 +82,9 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* void _mi_page_map_register(mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_internal(_mi_is_aligned(page,MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! if mi_unlikely(_mi_page_map == NULL) { - if (!mi_page_map_init()) return; + if (!_mi_page_map_init()) return; } mi_assert(_mi_page_map!=NULL); uint8_t* page_start; diff --git a/src/page.c b/src/page.c index 9b35a4db..056c9506 100644 --- a/src/page.c +++ b/src/page.c @@ -758,11 +758,6 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { Find pages with free blocks -------------------------------------------------------------*/ -// search for a best next page to use for at most N pages (often cut short if immediate blocks are available) -#define MI_MAX_CANDIDATE_SEARCH (8) - -#define MI_MAX_FULL_PAGES_PER_QUEUE (4) - // Find a page with free blocks of `page->block_size`. 
static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) { @@ -770,10 +765,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m #if MI_STAT size_t count = 0; #endif - #if MI_MAX_CANDIDATE_SEARCH > 1 - size_t candidate_count = 0; // we reset this on the first candidate to limit the search - #endif - size_t full_page_count = 0; + long candidate_limit = 0; // we reset this on the first candidate to limit the search + long full_page_retain = _mi_option_get_fast(mi_option_full_page_retain); mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -783,14 +776,11 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m #if MI_STAT count++; #endif - #if MI_MAX_CANDIDATE_SEARCH > 1 - candidate_count++; - #endif - + candidate_limit--; + // collect freed blocks by us and other threads _mi_page_free_collect(page, false); - #if MI_MAX_CANDIDATE_SEARCH > 1 // search up to N pages for a best candidate // is the local free list non-empty? @@ -799,8 +789,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // if the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often. if (!immediate_available && !mi_page_is_expandable(page)) { - full_page_count++; - if (full_page_count > MI_MAX_FULL_PAGES_PER_QUEUE) { + full_page_retain--; + if (full_page_retain < 0) { mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); mi_page_to_full(page, pq); } @@ -810,7 +800,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages) if (page_candidate == NULL) { page_candidate = page; - candidate_count = 0; + candidate_limit = _mi_option_get_fast(mi_option_max_page_candidates); } else if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); @@ -820,13 +810,14 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate - if (immediate_available || candidate_count > MI_MAX_CANDIDATE_SEARCH) { + if (immediate_available || candidate_limit <= 0) { mi_assert_internal(page_candidate!=NULL); break; } } - #else - // first-fit algorithm + + #if 0 + // first-fit algorithm without candidates // If the page contains free blocks, we are done if (mi_page_immediate_available(page) || mi_page_is_expandable(page)) { break; // pick this one From ec9c61c066d46ad998028d83e984ff33a5fb5470 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 14:53:24 -0800 Subject: [PATCH 035/264] initial no more pairmap --- include/mimalloc/internal.h | 7 +- include/mimalloc/types.h | 8 +- src/arena.c | 66 +-- src/bitmap.c | 937 ++++++++++++++---------------------- src/bitmap.h | 158 +++--- 5 files changed, 465 insertions(+), 711 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3a8b272e..d9c2cd6e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -700,7 +700,9 @@ static inline bool mi_page_try_claim_ownership(mi_page_t* page) { return ((old&1)==0); } -static inline void _mi_page_unown(mi_page_t* page) { +// release ownership of a page. 
This may free the page if all blocks were concurrently +// freed in the meantime. Returns true if the page was freed. +static inline bool _mi_page_unown(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); mi_thread_free_t tf_new; @@ -712,13 +714,14 @@ static inline void _mi_page_unown(mi_page_t* page) { if (mi_page_all_free(page)) { // it may become free just before unowning it _mi_arena_page_unabandon(page); _mi_arena_page_free(page); - return; + return true; } tf_old = mi_atomic_load_relaxed(&page->xthread_free); } mi_assert_internal(mi_tf_block(tf_old)==NULL); tf_new = mi_tf_create(NULL, false); } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); + return false; } //----------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d4c37c37..d78dbc59 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -117,16 +117,16 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) #endif #endif -#ifndef MI_BITMAP_CHUNK_BITS_SHIFT -#define MI_BITMAP_CHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) +#ifndef MI_BCHUNK_BITS_SHIFT +#define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) #endif -#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) +#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) #define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) #define MI_ARENA_MIN_OBJ_SLICES (1) -#define MI_ARENA_MAX_OBJ_SLICES (MI_BITMAP_CHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) +#define MI_ARENA_MAX_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE) #define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE) diff --git a/src/arena.c b/src/arena.c index 79a52c4d..fd609fe0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -48,7 +48,7 @@ typedef struct mi_arena_s { mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t* slices_purge; // can the slice be purged? (slice in purge => slice in free) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? 
- mi_pairmap_t pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages // followed by the bitmaps (whose size depends on the arena size) } mi_arena_t; @@ -476,16 +476,24 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2) { - mi_arena_t* arena = (mi_arena_t*)arg1; - mi_subproc_t* subproc = (mi_subproc_t*)arg2; - +static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2, bool* keep_abandoned) { // found an abandoned page of the right size - // it is set busy for now so we can read safely even with concurrent mi_free reclaiming - // try to claim ownership atomically - mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); - if (subproc != page->subproc) return false; - if (!mi_page_try_claim_ownership(page)) return false; + mi_arena_t* const arena = (mi_arena_t*)arg1; + mi_subproc_t* const subproc = (mi_subproc_t*)arg2; + mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + // can we claim ownership? + if (!mi_page_try_claim_ownership(page)) { + *keep_abandoned = true; + return false; + } + if (subproc != page->subproc) { + // wrong sub-process.. we need to unown again, and perhaps not keep it abandoned + const bool freed = _mi_page_unown(page); + *keep_abandoned = !freed; + return false; + } + // yes, we can reclaim it + *keep_abandoned = false; return true; } @@ -505,10 +513,10 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_forall_arenas(req_arena_id, allow_large, tseq, arena_id, arena) { size_t slice_index; - mi_pairmap_t* const pairmap = &arena->pages_abandoned[bin]; + mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; - if (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { - // found an abandoned page of the right size + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { + // found an abandoned page of the right size // and claimed ownership. 
mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); mi_assert_internal(mi_page_is_owned(page)); @@ -528,7 +536,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(!mi_page_is_full(page)); return page; - } + } } mi_forall_arenas_end(); return NULL; @@ -694,7 +702,7 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - mi_assert_internal(mi_pairmap_is_clear(&arena->pages_abandoned[bin], slice_index)); + mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); } #endif @@ -728,8 +736,8 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); - bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); - MI_UNUSED(were_zero); mi_assert_internal(were_zero); + const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); + MI_UNUSED(wasclear); mi_assert_internal(wasclear); mi_atomic_increment_relaxed(&subproc->abandoned_count[bin]); } else { @@ -783,7 +791,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done - mi_pairmap_clear_once_not_busy(&arena->pages_abandoned[bin], slice_index); + mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } @@ -956,12 +964,12 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* } static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_base) { - if (slice_count == 0) slice_count = MI_BITMAP_CHUNK_BITS; - mi_assert_internal((slice_count % MI_BITMAP_CHUNK_BITS) == 0); - const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BITMAP_CHUNK_SIZE); - const size_t bitmaps_size = 4 * mi_bitmap_size(slice_count,NULL); - const size_t pairmaps_size = MI_BIN_COUNT * 2 * mi_bitmap_size(slice_count,NULL); - const size_t size = base_size + bitmaps_size + pairmaps_size; + if (slice_count == 0) slice_count = MI_BCHUNK_BITS; + mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0); + const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BCHUNK_SIZE); + const size_t bitmaps_count = 4 + MI_BIN_COUNT; // free, commit, dirty, purge, and abandonded + const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count,NULL); + const size_t size = base_size + bitmaps_size; const size_t os_page_size = _mi_os_page_size(); const size_t info_size = _mi_align_up(size, os_page_size) + os_page_size; // + guard page @@ -992,7 +1000,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } - const size_t slice_count = _mi_align_down(size / MI_ARENA_SLICE_SIZE, MI_BITMAP_CHUNK_BITS); + const size_t slice_count = _mi_align_down(size / MI_ARENA_SLICE_SIZE, MI_BCHUNK_BITS); if (slice_count > MI_BITMAP_MAX_BIT_COUNT) { // 16 GiB for now // todo: allow larger areas (either by splitting it up in 
arena's or having larger arena's) _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB); @@ -1034,7 +1042,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); arena->slices_purge = mi_arena_bitmap_init(slice_count,&base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { - mi_pairmap_init(&arena->pages_abandoned[i], mi_arena_bitmap_init(slice_count, &base), mi_arena_bitmap_init(slice_count, &base)); + arena->pages_abandoned[i] = mi_arena_bitmap_init(slice_count,&base); } mi_assert_internal(mi_size_of_slices(info_slices) >= (size_t)(base - mi_arena_start(arena))); @@ -1112,9 +1120,9 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { - char buf[MI_BITMAP_CHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); - mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; - for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); + mi_bchunk_t* chunk = &bitmap->chunks[i]; + for (size_t j = 0, k = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { buf[k++] = '\n'; _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix); diff --git a/src/bitmap.c b/src/bitmap.c index f25c91ac..7df46070 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -77,50 +77,41 @@ static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { } // Clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0. -static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx) { +// `all_clear` is set if the new bfield is zero. +static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx); +//} + +static inline bool mi_bchunk_set(mi_bchunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - mi_assert_internal(idx < MI_BFIELD_BITS-1); - mi_assert_internal((idx%2)==0); - return mi_bfield_atomic_xset2(set, &chunk->bfields[i], idx, all_already_xset); + return mi_bfield_atomic_set(&chunk->bfields[i], idx); } -static inline bool mi_bitmap_chunk_set2(mi_bitmap_chunk_t* chunk, size_t cidx, bool* all_already_set) { - return mi_bitmap_chunk_xset2(MI_BIT_SET, chunk, cidx, all_already_set); -} - -static inline bool mi_bitmap_chunk_clear2(mi_bitmap_chunk_t* chunk, size_t cidx, bool* all_already_clear) { - return mi_bitmap_chunk_xset2(MI_BIT_CLEAR, chunk, cidx, all_already_clear); +static inline bool mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_clear(&chunk->bfields[i], idx, maybe_all_clear); } // Set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). 
-static bool mi_bitmap_chunk_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* pall_already_xset) { - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); +static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_xset) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; - size_t all_already_xset = 0; + size_t total_already_xset = 0; size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; while (n > 0) { size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field if (m > n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(field < MI_BCHUNK_FIELDS); const mi_bfield_t mask = mi_bfield_mask(m, idx); size_t already_xset = 0; const bool transition = mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset); - if (already_xset > 0 && transition) { - _mi_error_message(EFAULT, "ouch\n"); - } + mi_assert_internal((transition && already_xset == m) || (!transition && already_xset > 0)); all_transition = all_transition && transition; - all_already_xset += already_xset; + total_already_xset += already_xset; // next field field++; idx = 0; n -= m; } - if (pall_already_xset!=NULL) { *pall_already_xset = all_already_xset; } + if (palready_xset!=NULL) { *palready_xset = total_already_xset; } return all_transition; } -static inline bool mi_bitmap_chunk_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { - return mi_bitmap_chunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); +static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { + return mi_bchunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); } -static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* already_clear) { - return mi_bitmap_chunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, already_clear); +static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_clear) { + return mi_bchunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, already_clear); } -// check if a pair of bits is set/clear -static inline bool mi_bitmap_chunk_is_xset2(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - mi_assert_internal(idx < MI_BFIELD_BITS-1); - mi_assert_internal((idx%2)==0); - const size_t mask = (mi_bfield_t)0x03 << idx; - return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mask); -} - -static inline bool mi_bitmap_chunk_is_set2(mi_bitmap_chunk_t* chunk, size_t cidx) { - return mi_bitmap_chunk_is_xset2(MI_BIT_SET, chunk, cidx); -} - -static inline bool mi_bitmap_chunk_is_clear2(mi_bitmap_chunk_t* chunk, size_t cidx) { - return mi_bitmap_chunk_is_xset2(MI_BIT_CLEAR, chunk, cidx); -} +// ------ is_xset -------- // Check if a sequence of `n` bits within a chunk are all set/cleared. 
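The per-field walk that `mi_bchunk_xsetN` performs (and that the range checks below repeat) can be looked at in isolation: a run of `n` bits starting at `cidx` is split into one mask per bit field. A simplified standalone sketch; the field width and names are assumptions, not the patch's definitions:

#include <stdio.h>
#include <stddef.h>

#define FIELD_BITS  (8 * sizeof(size_t))

// a mask of `m` bits starting at `idx` (0 < m, idx + m <= FIELD_BITS)
static size_t mask_of(size_t m, size_t idx) {
  return (m >= FIELD_BITS ? ~(size_t)0 : (((size_t)1 << m) - 1)) << idx;
}

int main(void) {
  size_t cidx = 60, n = 100;                  // example run that crosses field boundaries
  size_t idx = cidx % FIELD_BITS, field = cidx / FIELD_BITS;
  while (n > 0) {
    size_t m = FIELD_BITS - idx;              // bits handled in this field
    if (m > n) { m = n; }
    printf("field %zu: mask 0x%zx\n", field, mask_of(m, idx));
    field++; idx = 0; n -= m;                 // continue in the next field
  }
  return 0;
}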
-static bool mi_bitmap_chunk_is_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); +static bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; @@ -378,7 +363,7 @@ static bool mi_bitmap_chunk_is_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, si size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field if (m > n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(field < MI_BCHUNK_FIELDS); const size_t mask = mi_bfield_mask(m, idx); if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mask)) { return false; @@ -392,71 +377,91 @@ static bool mi_bitmap_chunk_is_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, si } +// ------ try_xset -------- -static inline bool mi_bitmap_chunk_try_xset(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); +static inline bool mi_bchunk_try_xset(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; return mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx); } -static inline bool mi_bitmap_chunk_try_set(mi_bitmap_chunk_t* chunk, size_t cidx) { - return mi_bitmap_chunk_try_xset(MI_BIT_SET, chunk, cidx); +static inline bool mi_bchunk_try_set(mi_bchunk_t* chunk, size_t cidx) { + return mi_bchunk_try_xset(MI_BIT_SET, chunk, cidx); } -static inline bool mi_bitmap_chunk_try_clear(mi_bitmap_chunk_t* chunk, size_t cidx) { - return mi_bitmap_chunk_try_xset(MI_BIT_CLEAR, chunk, cidx); +static inline bool mi_bchunk_try_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, maybe_all_clear); } -static inline bool mi_bitmap_chunk_try_xset8(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx) { - mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + +//static inline bool mi_bchunk_try_xset8(mi_xset_t set, mi_bchunk_t* chunk, size_t byte_idx) { +// mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); +// const size_t i = byte_idx / MI_BFIELD_SIZE; +// const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; +// return mi_bfield_atomic_try_xset8(set, &chunk->bfields[i], ibyte_idx); +//} + +static inline bool mi_bchunk_try_set8(mi_bchunk_t* chunk, size_t byte_idx) { + mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); const size_t i = byte_idx / MI_BFIELD_SIZE; const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_xset8(set, &chunk->bfields[i], ibyte_idx); + return mi_bfield_atomic_try_set8(&chunk->bfields[i], ibyte_idx); } -static inline bool mi_bitmap_chunk_try_set8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { - return mi_bitmap_chunk_try_xset8(MI_BIT_SET, chunk, byte_idx); +static inline bool mi_bchunk_try_clear8(mi_bchunk_t* chunk, size_t byte_idx, bool* maybe_all_clear) { + mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_clear8(&chunk->bfields[i], ibyte_idx, maybe_all_clear); } -static 
inline bool mi_bitmap_chunk_try_clear8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { - return mi_bitmap_chunk_try_xset8(MI_BIT_CLEAR, chunk, byte_idx); -} // Try to atomically set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. -static bool mi_bitmap_chunk_try_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); +static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; size_t start_idx = cidx % MI_BFIELD_BITS; size_t start_field = cidx / MI_BFIELD_BITS; - size_t end_field = MI_BITMAP_CHUNK_FIELDS; + size_t end_field = MI_BCHUNK_FIELDS; mi_bfield_t mask_mid = 0; mi_bfield_t mask_end = 0; + bool field_is_clear; + bool maybe_all_clear = true; + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = false; } // first field size_t field = start_field; size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field if (m > n) { m = n; } mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); - mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(start_field < MI_BCHUNK_FIELDS); const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start)) return false; + if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start, &field_is_clear)) return false; + maybe_all_clear = maybe_all_clear && field_is_clear; // done? n -= m; - if (n==0) return true; + if (n==0) { + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; } + return true; + } // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields // mid fields while (n >= MI_BFIELD_BITS) { field++; - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(field < MI_BCHUNK_FIELDS); mask_mid = mi_bfield_all_set(); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore; + if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid, &field_is_clear)) goto restore; + maybe_all_clear = maybe_all_clear && field_is_clear; n -= MI_BFIELD_BITS; } @@ -464,12 +469,14 @@ static bool mi_bitmap_chunk_try_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, s if (n > 0) { mi_assert_internal(n < MI_BFIELD_BITS); field++; - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(field < MI_BCHUNK_FIELDS); end_field = field; mask_end = mi_bfield_mask(n, 0); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end)) goto restore; + if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end, &field_is_clear)) goto restore; + maybe_all_clear = maybe_all_clear && field_is_clear; } + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; } return true; restore: @@ -483,14 +490,23 @@ restore: return false; } -static inline bool mi_bitmap_chunk_try_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - return mi_bitmap_chunk_try_xsetN(MI_BIT_SET, chunk, cidx, n); +static inline bool mi_bchunk_try_setN(mi_bchunk_t* chunk, size_t cidx, size_t n) { + return mi_bchunk_try_xsetN(MI_BIT_SET, chunk, cidx, n, NULL); } -static inline bool mi_bitmap_chunk_try_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - return 
mi_bitmap_chunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n); +static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { + return mi_bchunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n, maybe_all_clear); } +static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); +} + +// ------ find_and_try_xset -------- + #if defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { return _mm256_setzero_si256(); @@ -507,10 +523,10 @@ static inline bool mi_mm256_is_zero( __m256i vec) { #endif // find least 0/1-bit in a chunk and try to set/clear it atomically -// set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { -#if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) +static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk, size_t* pidx) { +#if defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0) @@ -519,18 +535,18 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_ch if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); size_t cidx; if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } } // try again } -#elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) +#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; #if 1 @@ -559,24 +575,24 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_ch mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
const size_t chunk_idx = _tzcnt_u64(mask) / 8; #endif - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); size_t cidx; if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } } // try again } #else - for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { size_t idx; if mi_unlikely(mi_bfield_find_least_to_xset(set, chunk->bfields[i], &idx)) { // find least 0-bit if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx)) { // try to set it atomically *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } } @@ -585,38 +601,38 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_ch #endif } -static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { - return mi_bitmap_chunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); +static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) { + return mi_bchunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); } -static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { - return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); +static inline bool mi_bchunk_find_and_try_set(mi_bchunk_t* chunk, size_t* pidx) { + return mi_bchunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); } // find least byte in a chunk with all bits set, and try unset it atomically -// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) +static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) while(true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi8(vec, mi_mm256_ones()); // (byte == ~0 ? 
-1 : 0) const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte if (mask == 0) return false; const size_t i = _tzcnt_u32(mask); - mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(8*i < MI_BCHUNK_BITS); const size_t chunk_idx = i / MI_BFIELD_SIZE; const size_t byte_idx = i % MI_BFIELD_SIZE; if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } // try again } #else - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { const mi_bfield_t x = chunk->bfields[i]; // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F @@ -627,9 +643,9 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); mi_assert_internal((idx%8)==0); const size_t byte_idx = idx/8; - if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[i],byte_idx,NULL)) { // unset the byte atomically *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; } // else continue @@ -642,11 +658,11 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, // and try to clear them atomically. -// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. -static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { +// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { mi_bfield_t b = chunk->bfields[i]; size_t bshift = 0; size_t idx; @@ -657,10 +673,10 @@ static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_ if ((b&mask) == mask) { // found a match mi_assert_internal( ((mask << bshift) >> bshift) == mask ); - if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i],mask<bfields[i],mask< MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk - // if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); +// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + // if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); // we align an a field, and require `field_count` fields to be all clear. 
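The `has_set8` expression in the scalar path above is a SWAR variant of the classic has-zero-byte trick applied to `~x`, so it flags bytes of `x` that equal 0xFF; only the lowest flagged byte is guaranteed exact, which suffices because only the least one is taken. The full expression is elided by the hunk context here, so the standalone illustration below shows one equivalent formulation (assuming 64-bit fields), not necessarily the patch's exact one:

#include <assert.h>
#include <stdint.h>

#define LO_BIT8  (~(uint64_t)0 / 0xFF)    // 0x0101010101010101
#define HI_BIT8  (LO_BIT8 << 7)           // 0x8080808080808080

// low bit of each byte set iff the corresponding byte of x is 0xFF
// (flags above the lowest match may be spurious; callers only use the least one)
static uint64_t has_set8(uint64_t x) {
  return ((~x - LO_BIT8) & x & HI_BIT8) >> 7;
}

int main(void) {
  assert(has_set8(0x00FF0000000000FFu) == 0x0001000000000001u);
  assert(has_set8(0x7F7F7F7F7F7F7F7Fu) == 0);
  assert(has_set8(~(uint64_t)0) == LO_BIT8);   // every byte is 0xFF
  return 0;
}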
// n >= MI_BFIELD_BITS; find a first field that is 0 const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields - for (size_t i = 0; i <= MI_BITMAP_CHUNK_FIELDS - field_count; i++) + for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++) { // first pre-scan for a range of fields that are all set bool allset = true; size_t j = 0; do { - mi_assert_internal(i + j < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(i + j < MI_BCHUNK_FIELDS); mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); if (~b != 0) { allset = false; @@ -708,11 +724,11 @@ static bool mi_bitmap_chunk_find_and_try_clearN_(mi_bitmap_chunk_t* chunk, size_ // if all set, we can try to atomically clear them if (allset) { const size_t cidx = i*MI_BFIELD_BITS; - if (mi_bitmap_chunk_try_clearN(chunk, cidx, n)) { + if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) { // we cleared all atomically *pidx = cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(*pidx + n <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); return true; } } @@ -721,87 +737,43 @@ static bool mi_bitmap_chunk_find_and_try_clearN_(mi_bitmap_chunk_t* chunk, size_ } -static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bitmap_chunk_find_and_try_clear(chunk, pidx); - if (n==8) return mi_bitmap_chunk_find_and_try_clear8(chunk, pidx); - if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); - return mi_bitmap_chunk_find_and_try_clearN_(chunk, n, pidx); +static inline bool mi_bchunk_find_and_try_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); + if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); + return mi_bchunk_find_and_try_clearN_(chunk, n, pidx); } -// are all bits in a bitmap chunk set? -// static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { -// #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) -// const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); -// return _mm256_test_all_ones(vec); -// #else -// // written like this for vectorization -// mi_bfield_t x = chunk->bfields[0]; -// for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { -// x = x & chunk->bfields[i]; -// } -// return (~x == 0); -// #endif -// } -// are all bits in a bitmap chunk clear? -static inline bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return mi_mm256_is_zero(vec); - #elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) - const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); - if (!mi_mm256_is_zero(vec1)) return false; - const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - return (mi_mm256_is_zero(vec2)); - #else - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - if (chunk->bfields[i] != 0) return false; +// are all bits in a bitmap chunk clear? 
(this uses guaranteed atomic reads) +static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { + for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; } return true; +} + +// are all bits in a bitmap chunk clear? +static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { + #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return mi_mm256_is_zero(vec); + #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + // a 64b cache-line contains the entire chunk anyway so load both at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (mi_mm256_is_zero(_mm256_or_epi64(vec1,vec2))); + #else + return mi_bchunk_all_are_clear(chunk); #endif } + /* -------------------------------------------------------------------------------- - chunkmap (for now for 32-bit sets only) + chunkmap -------------------------------------------------------------------------------- */ -static void mi_chunkmap_split(mi_chunkmap_t es, mi_cmap_t* cmap, mi_epoch_t* epoch) { - *cmap = (mi_cmap_t)es; - if (epoch!=NULL) { *epoch = (mi_epoch_t)(es >> 32); } -} - -static mi_chunkmap_t mi_chunkmap_join(mi_cmap_t cmap, mi_epoch_t epoch) { - return ((mi_chunkmap_t)epoch << MI_CHUNKMAP_BITS) | cmap; -} - -// setting a bit increases the epoch -static void mi_chunkmap_set(_Atomic(mi_chunkmap_t)* cm, size_t idx) { - mi_assert(idx < MI_CHUNKMAP_BITS); - mi_epoch_t epoch; - mi_cmap_t cmap; - mi_chunkmap_t cm_new; - mi_chunkmap_t cm_old = mi_atomic_load_relaxed(cm); - do { - mi_chunkmap_split(cm_old, &cmap, &epoch); - cm_new = mi_chunkmap_join(cmap | (((mi_cmap_t)1)<chunk_maps[cmidx], idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx); } -static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx, mi_epoch_t epoch) { +static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; - const size_t idx = chunk_idx % MI_CHUNKMAP_BITS; - return mi_chunkmap_try_clear(&bitmap->chunk_maps[cmidx], idx, epoch); -} - -static mi_cmap_t mi_bitmap_chunkmap(mi_bitmap_t* bitmap, size_t chunk_idx, mi_epoch_t* epoch) { - mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; - mi_assert_internal(cmidx < bitmap->chunk_map_count); - mi_cmap_t cmap; - mi_chunkmap_split(mi_atomic_load_relaxed(&bitmap->chunk_maps[cmidx]), &cmap, epoch); - return cmap; -} - -static mi_epoch_t mi_bitmap_chunkmap_epoch(mi_bitmap_t* bitmap, size_t chunk_idx) { - mi_epoch_t epoch; - mi_bitmap_chunkmap(bitmap, chunk_idx, &epoch); - return epoch; + // check if the corresponding chunk is all clear + if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) return false; + // clear the chunkmap bit + mi_bchunk_clear(&bitmap->chunkmap, chunk_idx, NULL); + // .. but a concurrent set may have happened in between our all-clear test and the clearing of the + // bit in the mask. We check again to catch this situation. 
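The ordering relied on here reduces to a small standalone model: setters set bits in the word first and publish in the summary afterwards, while clearers tentatively clear the summary bit and then re-test the word so that a racing set is never lost. A sketch with plain C11 atomics and hypothetical names (a single-level summary over plain words, not the chunk/chunkmap types used in this file):

#include <stdatomic.h>
#include <stddef.h>

static _Atomic(unsigned long long) words[64];
static _Atomic(unsigned long long) summary;     // bit i set => words[i] may have bits set

static void set_bits(size_t i, unsigned long long mask) {
  atomic_fetch_or(&words[i], mask);             // 1. set in the word itself
  atomic_fetch_or(&summary, 1ull << i);         // 2. only then publish in the summary
}

static void summary_try_clear(size_t i) {
  if (atomic_load(&words[i]) != 0) return;      // word is not all-clear
  atomic_fetch_and(&summary, ~(1ull << i));     // tentatively clear the summary bit
  if (atomic_load(&words[i]) != 0) {            // a concurrent set may have raced in between
    atomic_fetch_or(&summary, 1ull << i);       // restore conservatively
  }
}

int main(void) {
  set_bits(3, 0xF0);
  summary_try_clear(3);                         // word 3 is not clear, so the summary bit stays set
  return ((atomic_load(&summary) >> 3) & 1) ? 0 : 1;
}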
+ if (!mi_bchunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + return false; + } + return true; } /* -------------------------------------------------------------------------------- @@ -841,14 +804,14 @@ static mi_epoch_t mi_bitmap_chunkmap_epoch(mi_bitmap_t* bitmap, size_t chunk_idx -------------------------------------------------------------------------------- */ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { - mi_assert_internal((bit_count % MI_BITMAP_CHUNK_BITS) == 0); - bit_count = _mi_align_up(bit_count, MI_BITMAP_CHUNK_BITS); + mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0); + bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS); mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); mi_assert_internal(bit_count > 0); - const size_t chunk_count = bit_count / MI_BITMAP_CHUNK_BITS; + const size_t chunk_count = bit_count / MI_BCHUNK_BITS; mi_assert_internal(chunk_count >= 1); - const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BITMAP_CHUNK_SIZE); - mi_assert_internal( (size%MI_BITMAP_CHUNK_SIZE) == 0 ); + const size_t size = sizeof(mi_bitmap_t) + ((chunk_count - 1) * MI_BCHUNK_SIZE); + mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); if (pchunk_count != NULL) { *pchunk_count = chunk_count; } return size; } @@ -861,8 +824,6 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) if (!already_zero) { _mi_memzero_aligned(bitmap, size); } - mi_atomic_store_release(&bitmap->chunk_map_count, _mi_divide_up(chunk_count, MI_CHUNKMAP_BITS)); - mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_map_count) <= MI_BITMAP_MAX_CHUNKMAPS); mi_atomic_store_release(&bitmap->chunk_count, chunk_count); mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); return size; @@ -874,32 +835,39 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); // first chunk - size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - size_t m = MI_BITMAP_CHUNK_BITS - cidx; + size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + size_t m = MI_BCHUNK_BITS - cidx; if (m > n) { m = n; } - mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); + mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; - const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; + const size_t mid_chunks = n / MI_BCHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * MI_BITMAP_CHUNK_SIZE); + _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); const size_t end_chunk = chunk_idx + mid_chunks; while (chunk_idx < end_chunk) { - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - chunk_idx++; + if ((chunk_idx % MI_BFIELD_BITS) == 0 && (chunk_idx + MI_BFIELD_BITS <= end_chunk)) { + // optimize: we can set a full bfield in the chunkmap + mi_atomic_store_relaxed( &bitmap->chunkmap.bfields[chunk_idx/MI_BFIELD_BITS], mi_bfield_all_set()); + chunk_idx += MI_BFIELD_BITS; + } + else { + mi_bitmap_chunkmap_set(bitmap, chunk_idx); + chunk_idx++; + } } - n -= (mid_chunks * MI_BITMAP_CHUNK_BITS); + n -= (mid_chunks * MI_BCHUNK_BITS); } // last chunk if (n > 0) { - mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(chunk_idx < 
MI_BITMAP_CHUNK_FIELDS); - mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); + mi_assert_internal(n < MI_BCHUNK_BITS); + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); } } @@ -909,22 +877,19 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // and false otherwise leaving the bitmask as is. static bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (set) { - // first set the chunkmap since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_try_set(&bitmap->chunks[chunk_idx], cidx); + const bool ok = mi_bchunk_try_set(&bitmap->chunks[chunk_idx], cidx); + if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards + return ok; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool cleared = mi_bitmap_chunk_try_clear(&bitmap->chunks[chunk_idx], cidx); - if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return cleared; + bool maybe_all_clear; + const bool ok = mi_bchunk_try_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return ok; } } @@ -933,126 +898,107 @@ static bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { static bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); mi_assert_internal(idx%8 == 0); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t byte_idx = (idx % MI_BCHUNK_BITS)/8; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); + const bool ok = mi_bchunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); + if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards + return ok; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); - bool cleared = mi_bitmap_chunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx); - if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return cleared; + bool maybe_all_clear; + const bool ok = mi_bchunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return ok; } } - // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask 
as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! static bool mi_bitmap_try_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n<=MI_BCHUNK_BITS); mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); if (n==0 || idx + n > mi_bitmap_max_bits(bitmap)) return false; - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia if (set) { - // first set the chunkmap since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); + const bool ok = mi_bchunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); + if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards + return ok; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); - bool cleared = mi_bitmap_chunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n); - if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return cleared; + bool maybe_all_clear; + const bool ok = mi_bchunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return ok; } } mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); - // todo: add 32/64 for large pages + // todo: add 32/64 for large pages ? return mi_bitmap_try_xsetN_(set, bitmap, idx, n); } -// Set/clear a sequence of 2 bits that were on an even `idx` in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
-static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal((idx%2)==0); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); +// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (set) { - // first set the chunkmap since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_set2(&bitmap->chunks[chunk_idx], cidx, NULL); + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return wasclear; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool already_clear = false; - const bool allset = mi_bitmap_chunk_clear2(&bitmap->chunks[chunk_idx], cidx, &already_clear); - if (!already_clear && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return allset; + bool maybe_all_clear; + const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return wasset; } } // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! 
static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n<=MI_BCHUNK_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia if (set) { - // first set the chunkmap since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); + const bool allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); + mi_bitmap_chunkmap_set(bitmap,chunk_idx); // set afterwards + return allclear; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); size_t already_clear = 0; - const bool allset = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear); + const bool allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear ); if (already_xset != NULL) { *already_xset = already_clear; } - if (already_clear < n && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } + if (already_clear < n) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } return allset; } } // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { - mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); //TODO: specialize? //if (n==1) return mi_bitmap_xset(set, bitmap, idx); //if (n==2) return mi_bitmap_xset(set, bitmap, idx); @@ -1061,82 +1007,52 @@ bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, s } -// Is a sequence of 2 bits already all set/cleared? -static inline bool mi_bitmap_is_xset2(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx + 2 <= mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - return mi_bitmap_chunk_is_xset2(set, &bitmap->chunks[chunk_idx], cidx); -} - - // Is a sequence of n bits already all set/cleared? 
bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n<=MI_BCHUNK_BITS); mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); + return mi_bchunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); } /* -------------------------------------------------------------------------------- bitmap try_find_and_clear -------------------------------------------------------------------------------- */ -static inline size_t mi_bitmap_find_hi_chunk(mi_bitmap_t* bitmap) { - size_t hi_chunk_map_idx = 0; - mi_cmap_t hi_cmap = 0; - for (size_t i = 1; i < mi_bitmap_chunk_map_count(bitmap); i++) { - mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, i, NULL); - if (cmap != 0) { - hi_chunk_map_idx = i; - hi_cmap = cmap; - } - } - uint32_t cmap_idx; - if (mi_bsr32(hi_cmap, &cmap_idx)) { - const size_t hi = (hi_chunk_map_idx * MI_CHUNKMAP_BITS) + cmap_idx; - mi_assert_internal(hi < mi_bitmap_chunk_count(bitmap)); - return hi; - } - else { - return 0; - } -} + #define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ const size_t chunk_start = 0; /* tseq % (1 + mi_bitmap_find_hi_chunk(bitmap)); */ \ - const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; \ - const uint32_t chunk_map_start_idx = (uint32_t)(chunk_start % MI_CHUNKMAP_BITS); \ + const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ + const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ + const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ - for (size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { \ - size_t i = (_i + chunk_map_start); \ - if (i >= bitmap->chunk_map_count) { i -= bitmap->chunk_map_count; } /* adjust for the start position */ \ + for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ + size_t i = (_i + chunkmap_start); \ + if (i >= chunkmap_max_bfield) { i -= chunkmap_max_bfield; } /* adjust for the start position */ \ \ - const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; \ - mi_epoch_t name_epoch; \ - mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &name_epoch); \ - uint32_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); cmap_idx_shift = chunk_map_start_idx; } /* rotate right for the start position (on the first iteration) */ \ + const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ + mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ + if (_i == 0) { cmap = mi_rotr(cmap, chunkmap_start_idx); cmap_idx_shift = chunkmap_start_idx; } /* rotate right for the start 
position (on the first iteration) */ \ \ - uint32_t cmap_idx; /* one bit set of each chunk that may have bits set */ \ - while (mi_bsf32(cmap, &cmap_idx)) { /* find least bit that is set */ \ + size_t cmap_idx; \ + while (mi_bsf(cmap, &cmap_idx)) { /* find least bit that is set */ \ /* set the chunk idx */ \ - size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_CHUNKMAP_BITS); \ - if (name_chunk_idx >= mi_bitmap_chunk_count(bitmap)) { name_chunk_idx -= mi_bitmap_chunk_count(bitmap); } \ + size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \ /* try to find and clear N bits in that chunk */ \ - if (name_chunk_idx < mi_bitmap_chunk_count(bitmap)) { /* we can have less chunks than in the chunkmap.. */ + { #define mi_bitmap_forall_chunks_end() \ } \ @@ -1146,7 +1062,7 @@ static inline size_t mi_bitmap_find_hi_chunk(mi_bitmap_t* bitmap) { cmap >>= 1; \ } \ }} - + // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) @@ -1154,17 +1070,15 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) { size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + if mi_likely(mi_bchunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); return true; } else { // we may find that all are cleared only on a second iteration but that is ok as // the chunkmap is a conservative approximation. 
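The scan in the `mi_bitmap_forall_chunks` macro above visits the set bits of a chunkmap word starting from a chosen position and wrapping around (the start is currently fixed at 0, see the todo). The wrap-around pattern in isolation, as a standalone sketch using a GCC/Clang builtin rather than the macro's helpers:

#include <stdint.h>
#include <stdio.h>

// visit every set bit of `map`, starting at `start` and wrapping around
static void forall_set_bits_from(uint64_t map, unsigned start) {
  uint64_t rot = (map >> start) | (map << ((64 - start) % 64));   // rotr(map, start)
  while (rot != 0) {
    const unsigned lsb = (unsigned)__builtin_ctzll(rot);          // least set bit of the rotated map
    const unsigned idx = (lsb + start) % 64;                      // index back into the original map
    printf("visit chunk %u\n", idx);
    rot &= (rot - 1);                                             // clear it and continue
  }
}

int main(void) {
  forall_set_bits_from(0x8000000000000081ull, 7);                 // visits 7 and 63, then wraps to 0
  return 0;
}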
- if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); // continue } } @@ -1172,183 +1086,48 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t return false; } -/* -------------------------------------------------------------------------------- - pairmap --------------------------------------------------------------------------------- */ -void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2) { - mi_assert_internal(mi_bitmap_chunk_count(bm1)==mi_bitmap_chunk_count(bm2)); - pairmap->bitmap1 = bm1; - pairmap->bitmap2 = bm2; -} - -static void mi_pairmap_from_pair_idx(mi_pairmap_t* pairmap, size_t pair_idx, mi_bitmap_t** bitmap, size_t* pidx) { - const size_t idx = 2*pair_idx; - const size_t maxbits = mi_bitmap_max_bits(pairmap->bitmap1); - mi_assert_internal(pair_idx < maxbits); - if (idx < maxbits) { - *bitmap = pairmap->bitmap1; - *pidx = idx; - } - else { - *bitmap = pairmap->bitmap2; - *pidx = idx - maxbits; - } -} - -bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx) { - mi_bitmap_t* bitmap; - size_t idx; - mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - return mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); -} - -bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx) { - mi_bitmap_t* bitmap; - size_t idx; - mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - return mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); -} - -bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { - mi_bitmap_t* bitmap; - size_t idx; - mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - return mi_bitmap_is_xset2(MI_BIT_CLEAR, bitmap, idx); -} - - - -/* -------------------------------------------------------------------------------- - pairmap clear while not busy --------------------------------------------------------------------------------- */ - -static inline bool mi_bfield_atomic_clear2_once_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). 
- mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); - const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); - mi_bfield_t bnew; - mi_bfield_t old = mi_atomic_load_relaxed(b); - do { - if mi_unlikely((old&mask)==mask_busy) { - old = mi_atomic_load_acquire(b); - if ((old&mask)==mask_busy) { _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); } - while ((old&mask)==mask_busy) { // busy wait - mi_atomic_yield(); - old = mi_atomic_load_acquire(b); - } - } - bnew = (old & ~mask); // clear - } while (!mi_atomic_cas_weak_acq_rel(b, &old, bnew)); - mi_assert_internal((old&mask) != mask_busy); // we should never clear a busy page - mi_assert_internal((old&mask) == mask); // in our case: we should only go from set to clear (when reclaiming an abandoned page from a free) - return ((old&mask) == mask); -} - -static inline bool mi_bitmap_chunk_clear2_once_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_clear2_once_not_busy(&chunk->bfields[i], idx); -} - -static bool mi_bitmap_clear2_once_not_busy(mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal((idx%2)==0); - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool cleared = mi_bitmap_chunk_clear2_once_not_busy(&bitmap->chunks[chunk_idx], cidx); - if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return cleared; -} - -void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { - mi_bitmap_t* bitmap; - size_t idx; - mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - mi_bitmap_clear2_once_not_busy(bitmap, idx); -} - - - -/* -------------------------------------------------------------------------------- - pairmap try and set busy --------------------------------------------------------------------------------- */ - -// Atomically go from set to busy, or return false otherwise and leave the bit field as-is. -static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). 
- mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); - const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); - mi_bfield_t old; - mi_bfield_t bnew; - do { - old = mi_atomic_load_relaxed(b); - if ((old & mask) != mask) return false; // no longer set - bnew = (old & ~mask) | mask_busy; - } while (!mi_atomic_cas_weak_acq_rel(b, &old, bnew)); - return true; -} - -static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { - for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - while (true) { - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]) & MI_BFIELD_LO_BIT2; // only keep MI_PAIR_SET bits - size_t idx; - if (!mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - break; // not found: continue with the next field - } - else { - mi_assert_internal((idx%2)==0); - if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); - return true; - } - // else: try this word once again - } - } - } - return false; -} - - -static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t idx_offset, size_t* ppair_idx, - mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2) +mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, + mi_claim_fun_t* claim, void* arg1, void* arg2) { mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) { - MI_UNUSED(epoch); MI_UNUSED(n); - mi_assert_internal(n==2); size_t cidx; - if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { - const size_t idx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal((idx%2)==0); - const size_t pair_idx = (idx + idx_offset)/2; - if (claim(pair_idx, arg1, arg2)) { // while busy, the claim function can read from the page - mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); // claimed, clear the entry - *ppair_idx = pair_idx; + if mi_likely(mi_bchunk_find_and_try_clear(&bitmap->chunks[chunk_idx], &cidx)) { + const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); + bool keep_set = true; + if ((*claim)(slice_index, arg1, arg2, &keep_set)) { + // success! + mi_assert_internal(!keep_set); + *pidx = slice_index; return true; } else { - mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); // not claimed, reset the entry - // and continue + // failed to claim it, set abandoned mapping again (unless thet page was freed) + if (keep_set) { + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + mi_assert_internal(wasclear); MI_UNUSED(wasclear); + } + // continue } } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); + // continue + } } mi_bitmap_forall_chunks_end(); return false; } -// Used to find an abandoned page, and transition from set to busy. 
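The claim protocol of `mi_bitmap_try_find_and_claim` above, reduced to a single-word standalone sketch (hypothetical names and a simplified callback signature; C11 atomics plus a GCC/Clang builtin): clear a candidate bit, let the callback validate the claim, and republish the bit only when the claim fails and the entry should remain visible.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

typedef bool (claim_fun_t)(size_t idx, void* arg, bool* keep_set);

static _Atomic(unsigned long long) amap;

static bool try_find_and_claim(claim_fun_t* claim, void* arg, size_t* pidx) {
  unsigned long long m = atomic_load(&amap);
  while (m != 0) {
    const size_t idx = (size_t)__builtin_ctzll(m);
    const unsigned long long bit = 1ull << idx;
    if (atomic_fetch_and(&amap, ~bit) & bit) {         // we cleared it: we own this candidate
      bool keep_set = true;
      if (claim(idx, arg, &keep_set)) { *pidx = idx; return true; }
      if (keep_set) { atomic_fetch_or(&amap, bit); }   // claim failed: publish it again
    }
    m &= ~bit;                                         // move on to the next candidate
  }
  return false;
}

static bool always_claim(size_t idx, void* arg, bool* keep_set) {
  (void)idx; (void)arg; *keep_set = false; return true;
}

int main(void) {
  atomic_store(&amap, 0x10);                           // one candidate at index 4
  size_t idx = 0;
  return (try_find_and_claim(&always_claim, NULL, &idx) && idx == 4) ? 0 : 1;
}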
-mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pair_idx, - mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2 ) { - if (mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, 0, pair_idx, claim, arg1, arg2)) return true; - return mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, mi_bitmap_max_bits(pairmap->bitmap1), pair_idx, claim, arg1, arg2); -} +// Clear a bit once it is set. +void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx); +} \ No newline at end of file diff --git a/src/bitmap.h b/src/bitmap.h index 78ee5380..9ef97d2f 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -19,35 +19,34 @@ Concurrent bitmap that can set/reset sequences of bits atomically each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB). We need 16K bits to represent a 1GiB arena. - `mi_bitmap_chunk_t`: a chunk of bfield's of a total of MI_BITMAP_CHUNK_BITS (= 512) + `mi_bchunk_t`: a chunk of bfield's of a total of MI_BCHUNK_BITS (= 512 on 64-bit, 256 on 32-bit) allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB). - These chunks are cache-aligned and we can use AVX2/AVX512/SVE/SVE2/etc. instructions + These chunks are cache-aligned and we can use AVX2/AVX512/NEON/SVE/SVE2/etc. instructions to scan for bits (perhaps) more efficiently. - `mi_chunkmap_t`: for each chunk we track if it has (potentially) any bit set. + `mi_bchunkmap_t` == `mi_bchunk_t`: for each chunk we track if it has (potentially) any bit set. The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set. This is used to avoid scanning every chunk. (and thus strictly an optimization) It is conservative: it is fine to a bit in the chunk map even if the chunk turns out - to have no bits set. + to have no bits set. It is also allowed to briefly have a clear bit even if the + chunk has bits set, as long as we guarantee that we set the bit later on -- this + allows us to set the chunkmap bit after we set a bit in the corresponding chunk. - When we (potentially) set a bit in a chunk, we first update the chunkmap. However, when we clear a bit in a chunk, and the chunk is indeed all clear, we cannot safely clear the bit corresponding to the chunk in the chunkmap since it - may race with another thread setting a bit in the same chunk (and we may clear the - bit even though a bit is set in the chunk which is not allowed). + may race with another thread setting a bit in the same chunk. Therefore, when + clearing, we first test if a chunk is clear, then clear the chunkmap bit, and + then test again to catch any set bits that we missed. - To fix this, the chunkmap contains 32-bits of bits for chunks, and a 32-bit "epoch" - counter that is increased everytime a bit is set. We only clear a bit if the epoch - stayed the same over our clear operation (so we know no other thread in the mean - time set a bit in any of the chunks corresponding to the chunkmap). 
- Since increasing the epoch and setting a bit must be atomic, we use only half-word - bits (32) (we could use 128-bit atomics if needed since modern hardware supports this) + Since the chunkmap may thus be briefly out-of-sync, this means that we may sometimes + not find a free page even though it's there (but we accept this as we avoid taking + full locks). (Another way to do this is to use an epoch but we like to avoid that complexity + for now). - `mi_bitmap_t`: a bitmap with N chunks. A bitmap always has MI_BITMAP_MAX_CHUNK_FIELDS (=16) - and can support arena's from few chunks up to 16 chunkmap's = 16 * 32 chunks = 16 GiB - The `chunk_count` can be anything from 1 to the max supported by the chunkmap's but - each chunk is always complete (512 bits, so 512 * 64KiB = 32MiB memory area's). + `mi_bitmap_t`: a bitmap with N chunks. A bitmap has a chunkmap of MI_BCHUNK_BITS (512) + and thus has at most 512 chunks (=2^18 bits x 64 KiB slices = 16 GiB max arena size). + The minimum is 1 chunk which is a 32 MiB arena. For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count and pop-count (but we think it can be adapted work reasonably well on older hardware too) @@ -56,60 +55,49 @@ Concurrent bitmap that can set/reset sequences of bits atomically // A word-size bit field. typedef size_t mi_bfield_t; -#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) -#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) -#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) -#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) -#define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. -#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. +#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) +#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) +#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) +#define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. +#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. -#define MI_BITMAP_CHUNK_SIZE (MI_BITMAP_CHUNK_BITS / 8) -#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) -#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) - -// A bitmap chunk contains 512 bits of bfields on 64_bit (256 on 32-bit) -typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { - _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; -} mi_bitmap_chunk_t; +#define MI_BCHUNK_SIZE (MI_BCHUNK_BITS / 8) +#define MI_BCHUNK_FIELDS (MI_BCHUNK_BITS / MI_BFIELD_BITS) // 8 on both 64- and 32-bit -// for now 32-bit epoch + 32-bit bit-set (note: with ABA instructions we can double this) -typedef uint64_t mi_chunkmap_t; -typedef uint32_t mi_epoch_t; -typedef uint32_t mi_cmap_t; +// A bitmap chunk contains 512 bits on 64-bit (256 on 32-bit) +typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bchunk_s { + _Atomic(mi_bfield_t) bfields[MI_BCHUNK_FIELDS]; +} mi_bchunk_t; -#define MI_CHUNKMAP_BITS (32) // 1 chunkmap tracks 32 chunks +// The chunkmap has one bit per corresponding chunk that is set if the chunk potentially has bits set. +// The chunkmap is itself a chunk. 
+typedef mi_bchunk_t mi_bchunkmap_t; -#define MI_BITMAP_MAX_CHUNKMAPS (16) -#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BITMAP_MAX_CHUNKMAPS * MI_CHUNKMAP_BITS) -#define MI_BITMAP_MIN_CHUNK_COUNT (1 * MI_CHUNKMAP_BITS) // 1 GiB arena +#define MI_BCHUNKMAP_BITS MI_BCHUNK_BITS -#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16 GiB arena -#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 1 GiB arena +#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BCHUNKMAP_BITS) +#define MI_BITMAP_MIN_CHUNK_COUNT (1) +#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena +#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena // An atomic bitmap -typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_map_count; // valid chunk_maps entries - _Atomic(size_t) chunk_count; // total count of chunks - size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc - _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; - - mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT +typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc + mi_bchunkmap_t chunkmap; + mi_bchunk_t chunks[1]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; -static inline size_t mi_bitmap_chunk_map_count(const mi_bitmap_t* bitmap) { - return mi_atomic_load_relaxed(&bitmap->chunk_map_count); -} - static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) { return mi_atomic_load_relaxed(&bitmap->chunk_count); } static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { - return (mi_bitmap_chunk_count(bitmap) * MI_BITMAP_CHUNK_BITS); + return (mi_bitmap_chunk_count(bitmap) * MI_BCHUNK_BITS); } @@ -134,9 +122,22 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); // Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); + +static inline bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_xset(MI_BIT_SET, bitmap, idx); +} + +static inline bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_xset(MI_BIT_CLEAR, bitmap, idx); +} + + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +// If `already_xset` is not NULL, it is to all the bits were already all set/cleared. 
bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { @@ -162,7 +163,7 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n // Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { @@ -177,48 +178,11 @@ static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); +typedef bool (mi_claim_fun_t)(size_t slice_index, void* arg1, void* arg2, bool* keep_set); +mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, + mi_claim_fun_t* claim, void* arg1, void* arg2); -/* -------------------------------------------------------------------------------- - Atomic bitmap for a pair of bits. - - The valid pairs are CLEAR (0), SET (3), or BUSY (2). - - These bit pairs are used in the abandoned pages maps: when set, the entry has - an available page. When we scan for an available abandoned page and find an entry SET, - we first set it to BUSY, and try to claim the page atomically (since it can race - with a concurrent `mi_free` which also tries to claim the page). However, unlike `mi_free`, - we cannot be sure that a concurrent `mi_free` also didn't free (and decommit) the page - just when we got the entry. Therefore, a page can only be freed after `mi_arena_unabandon` - which (busy) waits until the BUSY flag is cleared to ensure all readers are done. - (and pair-bit operations must therefore be release_acquire). 
--------------------------------------------------------------------------------- */ - -#define MI_PAIR_CLEAR (0) -#define MI_PAIR_UNUSED (1) // should never occur -#define MI_PAIR_BUSY (2) -#define MI_PAIR_SET (3) - -// 0b....0101010101010101 -#define MI_BFIELD_LO_BIT2 ((MI_BFIELD_LO_BIT8 << 6)|(MI_BFIELD_LO_BIT8 << 4)|(MI_BFIELD_LO_BIT8 << 2)|MI_BFIELD_LO_BIT8) - -// A pairmap manipulates pairs of bits (and consists of 2 bitmaps) -typedef struct mi_pairmap_s { - mi_bitmap_t* bitmap1; - mi_bitmap_t* bitmap2; -} mi_pairmap_t; - -// initialize a pairmap to all clear; avoid a mem_zero if `already_zero` is true -void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2); -bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx); -bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); -bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx); -void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); - -typedef bool (mi_bitmap_claim_while_busy_fun_t)(size_t pair_index, void* arg1, void* arg2); -mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx, - mi_bitmap_claim_while_busy_fun_t* claim, void* arg1 ,void* arg2 - ); - +void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); #endif // MI_BITMAP_H From 61436a92b9ec623220a92d1f2c166d39a64067a9 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 15:26:01 -0800 Subject: [PATCH 036/264] working simplified version without pairmaps and bitmap epoch --- src/bitmap.c | 48 +++++++++++++++++++++++----------------------- src/bitmap.h | 18 +++++++++++------ src/init.c | 2 +- src/page-map.c | 6 +++--- test/test-stress.c | 8 ++++---- 5 files changed, 44 insertions(+), 38 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 7df46070..0916aaae 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -42,9 +42,9 @@ static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { return mi_rotr(x,r); } -static inline mi_bfield_t mi_bfield_zero(void) { - return 0; -} +//static inline mi_bfield_t mi_bfield_zero(void) { +// return 0; +//} static inline mi_bfield_t mi_bfield_one(void) { return 1; @@ -147,10 +147,10 @@ static inline bool mi_bfield_atomic_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t // Tries to set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 // and otherwise false (leaving the bit unchanged) -static inline bool mi_bfield_atomic_try_set(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal(idx < MI_BFIELD_BITS); - return mi_bfield_atomic_set(b, idx); // for a single bit there is no difference -} +//static inline bool mi_bfield_atomic_try_set(_Atomic(mi_bfield_t)*b, size_t idx) { +// mi_assert_internal(idx < MI_BFIELD_BITS); +// return mi_bfield_atomic_set(b, idx); // for a single bit there is no difference +//} // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0. // `all_clear` is set to true if the new bfield is zero (and false otherwise) @@ -237,17 +237,17 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t by // Try to set a full field of bits atomically, and return true all bits transitioned from all 0's to 1's. // and false otherwise leaving the bit field as-is. 
-static inline bool mi_bfield_atomic_try_setX(_Atomic(mi_bfield_t)*b) { - mi_bfield_t old = 0; - return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_all_set()); -} +//static inline bool mi_bfield_atomic_try_setX(_Atomic(mi_bfield_t)*b) { +// mi_bfield_t old = 0; +// return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_all_set()); +//} // Try to clear a full field of bits atomically, and return true all bits transitioned from all 1's to 0's. // and false otherwise leaving the bit field as-is. -static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { - mi_bfield_t old = mi_bfield_all_set(); - return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_zero()); -} +//static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { +// mi_bfield_t old = mi_bfield_all_set(); +// return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_zero()); +//} // Check if all bits corresponding to a mask are set. @@ -328,7 +328,7 @@ static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size const mi_bfield_t mask = mi_bfield_mask(m, idx); size_t already_xset = 0; const bool transition = mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset); - mi_assert_internal((transition && already_xset == m) || (!transition && already_xset > 0)); + mi_assert_internal((transition && already_xset == 0) || (!transition && already_xset > 0)); all_transition = all_transition && transition; total_already_xset += already_xset; // next field @@ -605,9 +605,9 @@ static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx return mi_bchunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); } -static inline bool mi_bchunk_find_and_try_set(mi_bchunk_t* chunk, size_t* pidx) { - return mi_bchunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); -} +//static inline bool mi_bchunk_find_and_try_set(mi_bchunk_t* chunk, size_t* pidx) { +// return mi_bchunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); +//} // find least byte in a chunk with all bits set, and try unset it atomically @@ -763,7 +763,7 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - return (mi_mm256_is_zero(_mm256_or_epi64(vec1,vec2))); + return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2))); #else return mi_bchunk_all_are_clear(chunk); #endif @@ -810,7 +810,7 @@ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { mi_assert_internal(bit_count > 0); const size_t chunk_count = bit_count / MI_BCHUNK_BITS; mi_assert_internal(chunk_count >= 1); - const size_t size = sizeof(mi_bitmap_t) + ((chunk_count - 1) * MI_BCHUNK_SIZE); + const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE); mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); if (pchunk_count != NULL) { *pchunk_count = chunk_count; } return size; @@ -1044,10 +1044,10 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0) { cmap = mi_rotr(cmap, chunkmap_start_idx); cmap_idx_shift = chunkmap_start_idx; } /* rotate right for the start position (on the first iteration) */ \ + if (_i == 0) { cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); cmap_idx_shift 
= chunkmap_start_idx; } /* rotate right for the start position (on the first iteration) */ \ \ size_t cmap_idx; \ - while (mi_bsf(cmap, &cmap_idx)) { /* find least bit that is set */ \ + while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ /* set the chunk idx */ \ size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \ @@ -1130,4 +1130,4 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx); -} \ No newline at end of file +} diff --git a/src/bitmap.h b/src/bitmap.h index 9ef97d2f..7b6000cc 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -75,12 +75,18 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bchunk_s { // The chunkmap is itself a chunk. typedef mi_bchunk_t mi_bchunkmap_t; -#define MI_BCHUNKMAP_BITS MI_BCHUNK_BITS +#define MI_BCHUNKMAP_BITS MI_BCHUNK_BITS -#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BCHUNKMAP_BITS) -#define MI_BITMAP_MIN_CHUNK_COUNT (1) -#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena -#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena +#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BCHUNKMAP_BITS) +#define MI_BITMAP_MIN_CHUNK_COUNT (1) +#if MI_SIZE_BITS > 32 +#define MI_BITMAP_DEFAULT_CHUNK_COUNT (64) // 2 GiB on 64-bit -- this is for the page map +#else +#define MI_BITMAP_DEFAULT_CHUNK_COUNT (1) +#endif +#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena +#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena +#define MI_BITMAP_DEFAULT_BIT_COUNT (MI_BITMAP_DEFAULT_CHUNK_COUNT * MI_BCHUNK_BITS) // 2 GiB arena // An atomic bitmap @@ -88,7 +94,7 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc mi_bchunkmap_t chunkmap; - mi_bchunk_t chunks[1]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT + mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; diff --git a/src/init.c b/src/init.c index 64b31e1b..5d4a775a 100644 --- a/src/init.c +++ b/src/init.c @@ -400,7 +400,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } diff --git a/src/page-map.c b/src/page-map.c index 25693064..c292378b 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -13,7 +13,7 @@ mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; -static mi_bitmap_t mi_page_map_commit = { 1, MI_BITMAP_MIN_CHUNK_COUNT }; +static mi_bitmap_t mi_page_map_commit = { MI_BITMAP_DEFAULT_CHUNK_COUNT, { 0 }, { 0 }, { { 0 } } }; bool _mi_page_map_init(void) { size_t vbits = _mi_os_virtual_address_bits(); @@ -22,10 +22,10 @@ bool _mi_page_map_init(void) { // 64 KiB for 4 GiB address space (on 32-bit) const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); - mi_page_map_entries_per_commit_bit = 
_mi_divide_up(page_map_size, MI_BITMAP_MIN_BIT_COUNT); + mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_DEFAULT_BIT_COUNT); // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); - mi_page_map_all_committed = false; // _mi_os_has_overcommit(); // commit on-access on Linux systems? + mi_page_map_all_committed = (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); diff --git a/test/test-stress.c b/test/test-stress.c index 61891269..d5f106d5 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -41,11 +41,11 @@ static int THREADS = 8; static int SCALE = 10; static int ITER = 10; #elif 0 -static int THREADS = 1; +static int THREADS = 4; static int SCALE = 100; static int ITER = 10; #define ALLOW_LARGE false -#elif 1 +#elif 0 static int THREADS = 32; static int SCALE = 50; static int ITER = 50; @@ -343,9 +343,9 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - mi_debug_show_arenas(true, true, false); + //mi_debug_show_arenas(true, true, false); mi_collect(true); - // mi_debug_show_arenas(true,true,false); + mi_debug_show_arenas(true,true,false); #endif // mi_stats_print(NULL); #else From 5a5943ad33551c4fcd84d62be1564445f25d52d4 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 21:03:33 -0800 Subject: [PATCH 037/264] record max_clear bit --- src/arena.c | 32 ++++++++++++++++++++++++------ src/bitmap.c | 53 +++++++++++++++++++++++++++++++++++--------------- src/bitmap.h | 5 +++-- src/page-map.c | 5 ++++- 4 files changed, 70 insertions(+), 25 deletions(-) diff --git a/src/arena.c b/src/arena.c index fd609fe0..2c215264 100644 --- a/src/arena.c +++ b/src/arena.c @@ -476,23 +476,30 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2, bool* keep_abandoned) { +static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* arg2, bool* keep_abandoned) { // found an abandoned page of the right size mi_arena_t* const arena = (mi_arena_t*)arg1; mi_subproc_t* const subproc = (mi_subproc_t*)arg2; mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); // can we claim ownership? if (!mi_page_try_claim_ownership(page)) { + // there was a concurrent free .. + // we need to keep it in the abandoned map as the free will call `mi_arena_page_unabandon`, + // and wait for readers (us!) to finish. This is why it is very important to set the abandoned + // bit again (or otherwise the unabandon will never stop waiting). *keep_abandoned = true; return false; } if (subproc != page->subproc) { - // wrong sub-process.. we need to unown again, and perhaps not keep it abandoned + // wrong sub-process.. we need to unown again + // (an unown might free the page, and depending on that we can keep it in the abandoned map or not) + // note: a minor wrinkle: the page will still be mapped but the abandoned map entry is (temporarily) clear at this point. + // so we cannot check in `mi_arena_free` for this invariant to hold. 
const bool freed = _mi_page_unown(page); *keep_abandoned = !freed; return false; } - // yes, we can reclaim it + // yes, we can reclaim it, keep the abandaned map entry clear *keep_abandoned = false; return true; } @@ -515,7 +522,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; - if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, subproc)) { // found an abandoned page of the right size // and claimed ownership. mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); @@ -703,6 +710,9 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); + // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may + // be (temporarily) not true if the free happens while trying to reclaim + // see `mi_arana_try_claim_abandoned` } #endif @@ -1087,10 +1097,11 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc return ENOMEM; } _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + // mi_debug_show_arenas(true, true, false); + return 0; } - // Manage a range of regular OS memory bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); @@ -1121,13 +1132,22 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); + size_t k = 0; mi_bchunk_t* chunk = &bitmap->chunks[i]; - for (size_t j = 0, k = 0; j < MI_BCHUNK_FIELDS; j++) { + + if (i<10) { buf[k++] = ' '; } + if (i<100) { itoa((int)i, buf+k, 10); k += (i < 10 ? 1 : 2); } + buf[k++] = ' '; + + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { buf[k++] = '\n'; _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix); buf[k++] = ' '; buf[k++] = ' '; + buf[k++] = ' '; + buf[k++] = ' '; + buf[k++] = ' '; } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; diff --git a/src/bitmap.c b/src/bitmap.c index 0916aaae..15401d8d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -87,7 +87,7 @@ static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bo } // Clear a bit but only when/once it is set. This is used by concurrent free's while -// the page is abandoned and mapped. +// the page is abandoned and mapped. 
static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[i+j]); - if (~b != 0) { - allset = false; - i += j; // no need to look again at the previous fields - break; + size_t idx; + if (mi_bfield_find_least_bit(~b,&idx)) { + if (m > idx) { + allset = false; + i += j; // no need to look again at the previous fields + break; + } + } + else { + // all bits in b were set + m -= MI_BFIELD_BITS; // note: can underflow } } while (++j < field_count); - + // if all set, we can try to atomically clear them if (allset) { const size_t cidx = i*MI_BFIELD_BITS; @@ -796,6 +804,11 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } + // record the max clear + size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); + do { + if mi_likely(chunk_idx <= oldmax) break; + } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); return true; } @@ -853,6 +866,7 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { if ((chunk_idx % MI_BFIELD_BITS) == 0 && (chunk_idx + MI_BFIELD_BITS <= end_chunk)) { // optimize: we can set a full bfield in the chunkmap mi_atomic_store_relaxed( &bitmap->chunkmap.bfields[chunk_idx/MI_BFIELD_BITS], mi_bfield_all_set()); + mi_bitmap_chunkmap_set(bitmap, chunk_idx + MI_BFIELD_BITS - 1); // track the max set chunk_idx += MI_BFIELD_BITS; } else { @@ -1032,20 +1046,24 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_start = 0; /* tseq % (1 + mi_bitmap_find_hi_chunk(bitmap)); */ \ + const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); /* mi_bitmap_chunk_count(bitmap) */ \ + const size_t chunk_start = 0; /* (chunk_max <= 1 ? 0 : (tseq % chunk_max)); */ /* space out threads */ \ const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ size_t i = (_i + chunkmap_start); \ - if (i >= chunkmap_max_bfield) { i -= chunkmap_max_bfield; } /* adjust for the start position */ \ - \ + if (i >= chunkmap_max_bfield) { \ + i -= chunkmap_max_bfield; /* adjust for the start position */ \ + } \ const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0) { cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); cmap_idx_shift = chunkmap_start_idx; } /* rotate right for the start position (on the first iteration) */ \ - \ + if (_i == 0) { \ + cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \ + cmap_idx_shift = chunkmap_start_idx; \ + } \ size_t cmap_idx; \ while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ /* set the chunk idx */ \ @@ -1065,6 +1083,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. 
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +// (Used to find fresh free slices.) mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) @@ -1087,6 +1106,8 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t } +// Find a set bit in the bitmap and try to atomically clear it and claim it. +// (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, void* arg1, void* arg2) { @@ -1108,7 +1129,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t if (keep_set) { const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); mi_assert_internal(wasclear); MI_UNUSED(wasclear); - } + } // continue } } diff --git a/src/bitmap.h b/src/bitmap.h index 7b6000cc..7938bfa0 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -91,8 +91,9 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; diff --git a/src/page-map.c b/src/page-map.c index c292378b..ca0e2481 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -13,7 +13,10 @@ mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; -static mi_bitmap_t mi_page_map_commit = { MI_BITMAP_DEFAULT_CHUNK_COUNT, { 0 }, { 0 }, { { 0 } } }; + +// (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) +static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), + { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; bool _mi_page_map_init(void) { size_t vbits = _mi_os_virtual_address_bits(); From 659a9dd51d1d02b620ea569d62fdda76dcb60c38 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 22:37:59 -0800 Subject: [PATCH 038/264] fix page info size and order; atomic page flags --- CMakeLists.txt | 2 +- include/mimalloc/atomic.h | 4 +- include/mimalloc/internal.h | 24 ++++++++--- include/mimalloc/types.h | 81 +++++++++++++++---------------------- src/arena.c | 36 ++++++++--------- src/bitmap.c | 6 +-- src/free.c | 5 ++- src/init.c | 10 ++--- src/os.c | 4 +- test/test-stress.c | 4 +- 10 files changed, 87 insertions(+), 89 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c04aea8..1a4cc1f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -360,7 +360,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") else() - list(APPEND mi_cflags -ftls-model=initial-exec 
-march=haswell -mavx2) + list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2 -O2) endif() endif() if(MI_OVERRIDE) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 3a0d4892..caa90cf8 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -80,10 +80,12 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) #define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed)) -#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_and_relaxed(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_or_relaxed(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel)) #define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index d9c2cd6e..ad7c41c6 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -667,7 +667,8 @@ static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { static inline bool mi_page_is_huge(const mi_page_t* page) { - return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN)); + return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || + (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page)); } @@ -727,20 +728,33 @@ static inline bool _mi_page_unown(mi_page_t* page) { //----------------------------------------------------------- // Page flags //----------------------------------------------------------- +static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { + return mi_atomic_load_acquire(&page->xflags); +} + +static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { + if (set) { + mi_atomic_or_acq_rel(&page->xflags, newflag); + } + else { + mi_atomic_and_acq_rel(&page->xflags, ~newflag); + } +} + static inline bool mi_page_is_in_full(const mi_page_t* page) { - return page->flags.x.in_full; + return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0); } static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { - page->flags.x.in_full = in_full; + mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE); } static inline bool mi_page_has_aligned(const mi_page_t* page) { - return page->flags.x.has_aligned; + return ((mi_page_flags(page) & MI_PAGE_HAS_ALIGNED) != 0); } static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { - page->flags.x.has_aligned = has_aligned; + mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED); } /* ------------------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d78dbc59..5dfbb808 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -111,17 +111,17 
@@ terms of the MIT license. A copy of the license can be found in the file // Sizes are for 64-bit #ifndef MI_ARENA_SLICE_SHIFT -#ifdef MI_SMALL_PAGE_SHIFT // compatibility +#ifdef MI_SMALL_PAGE_SHIFT // compatibility #define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT #else #define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) #endif #endif #ifndef MI_BCHUNK_BITS_SHIFT -#define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) +#define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) #endif -#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) +#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) #define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) @@ -167,8 +167,8 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) { typedef struct mi_memid_os_info { void* base; // actual base address of the block (used for offset aligned allocations) - size_t alignment; // alignment at allocation size_t size; // allocated full size + // size_t alignment; // alignment at allocation } mi_memid_os_info_t; typedef struct mi_memid_arena_info { @@ -224,26 +224,11 @@ typedef enum mi_owned_e { } mi_owned_t; -// The `in_full` and `has_aligned` page flags are put in a union to efficiently -// test if both are false (`full_aligned == 0`) in the `mi_free` routine. -#if !MI_TSAN -typedef union mi_page_flags_s { - uint8_t full_aligned; - struct { - uint8_t in_full : 1; - uint8_t has_aligned : 1; - } x; -} mi_page_flags_t; -#else -// under thread sanitizer, use a byte for each flag to suppress warning, issue #130 -typedef union mi_page_flags_s { - uint32_t full_aligned; - struct { - uint8_t in_full; - uint8_t has_aligned; - } x; -} mi_page_flags_t; -#endif +// The `in_full` and `has_aligned` page flags are put in the same field +// to efficiently test if both are false (`full_aligned == 0`) in the `mi_free` routine. +#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) +#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) +typedef size_t mi_page_flags_t; // Thread free list. // We use the bottom bit of the pointer for `mi_owned_t` flags @@ -280,35 +265,33 @@ typedef struct mi_subproc_s mi_subproc_t; // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { - _Atomic(mi_threadid_t)xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. 
(= xheap->thread_id, or 0 if abandoned) - mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) - uint16_t reserved; // number of blocks reserved in memory - uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type + mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) + uint16_t reserved; // number of blocks reserved in memory + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t retire_expire; // expiration count for retired blocks - mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) - uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized - uint8_t retire_expire:7; // expiration count for retired blocks - // padding - - mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - size_t block_size; // size available in each block (always `>0`) - uint8_t* page_start; // start of the blocks + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) + _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads + _Atomic(mi_page_flags_t) xflags; // `in_full` and `has_aligned` flags + size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the blocks + uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type + bool free_is_zero; // `true` if the blocks in the free list are zero initialized + // padding #if (MI_ENCODE_FREELIST || MI_PADDING) - uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary + uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary #endif - _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - - mi_heap_t* heap; // heap this threads belong to. - struct mi_page_s* next; // next page owned by the heap with the same `block_size` - struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - mi_subproc_t* subproc; // sub-process of this heap - mi_memid_t memid; // provenance of the page memory + mi_heap_t* heap; // heap this threads belong to. + struct mi_page_s* next; // next page owned by the heap with the same `block_size` + struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + mi_subproc_t* subproc; // sub-process of this heap + mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -317,10 +300,10 @@ typedef struct mi_page_s { // ------------------------------------------------------ #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. 
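[editor note] Because both page flags now live in the single atomic `xflags` word above, one load can rule out both slow-path conditions at once; the sketch below (not part of the patch, with a name of my own choosing) shows the test that the `free.c` hunk later in this commit relies on.

// Sketch, not part of the patch: one atomic load of `xflags` covers both flags.
static inline bool page_needs_slow_free_sketch(const mi_page_t* page) {
  const mi_page_flags_t flags = mi_page_flags(page);
  if (flags == 0) return false;  // fast path: not in the full queue, no aligned blocks
  // otherwise at least one slow-path condition holds:
  //   (flags & MI_PAGE_IN_FULL_QUEUE) != 0  -> page must be moved out of the full queue
  //   (flags & MI_PAGE_HAS_ALIGNED)   != 0  -> the freed pointer may need un-aligning
  return true;
}
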
-#define MI_PAGE_MIN_BLOCK_ALIGN (32) // minimal block alignment in a page +#define MI_PAGE_MIN_BLOCK_ALIGN (64) // minimal block alignment in a page #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation -#if MI_DEBUG && MI_SIZE_SIZE == 8 +#if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 #define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) #else #define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) diff --git a/src/arena.c b/src/arena.c index 2c215264..45697081 100644 --- a/src/arena.c +++ b/src/arena.c @@ -483,7 +483,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* a mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); // can we claim ownership? if (!mi_page_try_claim_ownership(page)) { - // there was a concurrent free .. + // there was a concurrent free .. // we need to keep it in the abandoned map as the free will call `mi_arena_page_unabandon`, // and wait for readers (us!) to finish. This is why it is very important to set the abandoned // bit again (or otherwise the unabandon will never stop waiting). @@ -596,7 +596,9 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz } } #endif - mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)); + if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) { + _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small\n"); + }; const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); @@ -1126,28 +1128,22 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) { - _mi_output_message("%s%s:\n", prefix, header); +static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) { + _mi_output_message("%s:\n", header); size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); size_t k = 0; mi_bchunk_t* chunk = &bitmap->chunks[i]; - - if (i<10) { buf[k++] = ' '; } - if (i<100) { itoa((int)i, buf+k, 10); k += (i < 10 ? 
1 : 2); } - buf[k++] = ' '; + if (i<10) { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; } + else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } + else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { - buf[k++] = '\n'; - _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix); - buf[k++] = ' '; - buf[k++] = ' '; - buf[k++] = ' '; - buf[k++] = ' '; - buf[k++] = ' '; + buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5; } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; @@ -1164,9 +1160,9 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ } bit_count += MI_BFIELD_BITS; } - _mi_output_message("%s %s\n", prefix, buf); + _mi_output_message(" %s\n", buf); } - _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count); + _mi_output_message(" total ('x'): %zu\n", bit_set_count); return bit_set_count; } @@ -1183,12 +1179,12 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) slice_total += arena->slice_count; _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, arena->slices_free, true); + free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true); } - mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, arena->slices_committed, false); + mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false); // todo: abandoned slices if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, arena->slices_purge, false); + purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); diff --git a/src/bitmap.c b/src/bitmap.c index 15401d8d..2ef692cb 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -805,10 +805,10 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) return false; } // record the max clear - size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); + /*size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); do { if mi_likely(chunk_idx <= oldmax) break; - } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); + } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));*/ return true; } @@ -1046,7 +1046,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); /* mi_bitmap_chunk_count(bitmap) */ \ + /* const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); */ /* mi_bitmap_chunk_count(bitmap) */ \ const size_t chunk_start = 0; /* (chunk_max <= 1 ? 
0 : (tseq % chunk_max)); */ /* space out threads */ \ const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ diff --git a/src/free.c b/src/free.c index 0ff4bf60..afb23838 100644 --- a/src/free.c +++ b/src/free.c @@ -163,8 +163,9 @@ void mi_free(void* p) mi_attr_noexcept if mi_unlikely(page==NULL) return; const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); + const mi_page_flags_t flags = mi_page_flags(page); if mi_likely(is_local) { // thread-local free? - if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) + if mi_likely(flags == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); @@ -176,7 +177,7 @@ void mi_free(void* p) mi_attr_noexcept } else { // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) - if mi_likely(page->flags.full_aligned == 0) { + if mi_likely(flags == 0) { // blocks are aligned (and not a full page) mi_block_t* const block = (mi_block_t*)p; mi_free_block_mt(page,block); diff --git a/src/init.c b/src/init.c index 5d4a775a..4fbd50ed 100644 --- a/src/init.c +++ b/src/init.c @@ -20,21 +20,21 @@ const mi_page_t _mi_page_empty = { 0, // capacity 0, // reserved capacity 0, // block size shift - 0, // heap tag - { 0 }, // flags - false, // is_zero 0, // retire_expire NULL, // local_free + MI_ATOMIC_VAR_INIT(0), // xthread_free + MI_ATOMIC_VAR_INIT(0), // xflags 0, // block_size NULL, // page_start + 0, // heap tag + false, // is_zero #if (MI_PADDING || MI_ENCODE_FREELIST) { 0, 0 }, #endif - MI_ATOMIC_VAR_INIT(0), // xthread_free NULL, // xheap NULL, NULL, // next, prev NULL, // subproc - { {{ NULL, 0, 0}}, false, false, false, MI_MEM_NONE } // memid + { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) diff --git a/src/os.c b/src/os.c index c7f464c0..156a655b 100644 --- a/src/os.c +++ b/src/os.c @@ -128,7 +128,7 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me // different base? 
(due to alignment) if (memid.mem.os.base != base) { mi_assert(memid.mem.os.base <= addr); - mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); + // mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); base = memid.mem.os.base; if (memid.mem.os.size==0) { csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); } } @@ -305,7 +305,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo if (p != NULL) { *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; - memid->mem.os.alignment = alignment; + // memid->mem.os.alignment = alignment; memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned } return p; diff --git a/test/test-stress.c b/test/test-stress.c index d5f106d5..d46c2484 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,7 +40,7 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 0 +#elif 1 static int THREADS = 4; static int SCALE = 100; static int ITER = 10; @@ -347,6 +347,8 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,true,false); #endif + mi_collect(true); + mi_debug_show_arenas(true, true, false); // mi_stats_print(NULL); #else mi_stats_print(NULL); // so we see rss/commit/elapsed From bf9a2ddb59778dc11a39e380f9b7ba49c9f34ecb Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 23:07:10 -0800 Subject: [PATCH 039/264] compile for 32-bit as well --- include/mimalloc/types.h | 2 +- src/bitmap.c | 23 +++++++++++++---------- src/page-map.c | 4 ++-- test/main-override.cpp | 2 +- test/test-stress.c | 2 +- 5 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 5dfbb808..ba9a8864 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -300,7 +300,7 @@ typedef struct mi_page_s { // ------------------------------------------------------ #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. 
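[editor note] Since `MI_PAGE_INFO_SIZE` must stay at least `sizeof(mi_page_t)` (this series adds a runtime check for that in `mi_arena_page_alloc_fresh`), a compile-time guard placed after the `MI_PAGE_INFO_SIZE` definition is one way to catch layout regressions early. A sketch under that assumption, not part of the patch:

// Sketch, not part of the patch: compile-time variant of the runtime check
// added in mi_arena_page_alloc_fresh (requires C11 or later).
#if !defined(__cplusplus) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
_Static_assert(sizeof(mi_page_t) <= MI_PAGE_INFO_SIZE,
               "MI_PAGE_INFO_SIZE must be >= sizeof(mi_page_t)");
#endif
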
-#define MI_PAGE_MIN_BLOCK_ALIGN (64) // minimal block alignment in a page +#define MI_PAGE_MIN_BLOCK_ALIGN MI_SIZE_BITS // minimal block alignment in a page (64b on 64-bit, 32b on 32-bit) #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation #if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 diff --git a/src/bitmap.c b/src/bitmap.c index 2ef692cb..7f4c8776 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -805,10 +805,10 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) return false; } // record the max clear - /*size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); + size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); do { if mi_likely(chunk_idx <= oldmax) break; - } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));*/ + } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); return true; } @@ -1042,21 +1042,23 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n -------------------------------------------------------------------------------- */ -#define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ +#define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - /* const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); */ /* mi_bitmap_chunk_count(bitmap) */ \ - const size_t chunk_start = 0; /* (chunk_max <= 1 ? 0 : (tseq % chunk_max)); */ /* space out threads */ \ + const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \ const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ + const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\ const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ - size_t i = (_i + chunkmap_start); \ - if (i >= chunkmap_max_bfield) { \ - i -= chunkmap_max_bfield; /* adjust for the start position */ \ + size_t i; \ + if (_i < chunkmap_hi_bfield) { \ + i = _i + chunkmap_start; /* first the chunks up to chunk_hi */ \ + if (i >= chunkmap_hi_bfield) { i -= chunkmap_hi_bfield; } /* rotate */ \ } \ + else { i = _i; } /* the rest of the chunks above chunk_hi_idx */ \ const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ size_t cmap_idx_shift = 0; /* shift through the cmap */ \ @@ -1086,7 +1088,8 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // (Used to find fresh free slices.) 
mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); + mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { size_t cidx; if mi_likely(mi_bchunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { @@ -1111,7 +1114,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, void* arg1, void* arg2) { - mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { size_t cidx; if mi_likely(mi_bchunk_find_and_try_clear(&bitmap->chunks[chunk_idx], &cidx)) { diff --git a/src/page-map.c b/src/page-map.c index ca0e2481..d849e6a2 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -43,9 +43,9 @@ bool _mi_page_map_init(void) { bool is_zero; _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero, NULL); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(_mi_page_map, _mi_os_page_size()); } - _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL - mi_assert_internal(_mi_ptr_page(NULL)==NULL); } + _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL + mi_assert_internal(_mi_ptr_page(NULL)==NULL); return true; } diff --git a/test/main-override.cpp b/test/main-override.cpp index 3f64117a..5a1fc6d2 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -382,7 +382,7 @@ static void test_mt_shutdown() // issue #372 static void fail_aslr() { - size_t sz = (4ULL << 40); // 4TiB + uint64_t sz = (4ULL << 40); // 4TiB void* p = malloc(sz); printf("pointer p: %p: area up to %p\n", p, (uint8_t*)p + sz); *(int*)0x5FFFFFFF000 = 0; // should segfault diff --git a/test/test-stress.c b/test/test-stress.c index d46c2484..19edf2b5 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,7 +40,7 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 1 +#elif 0 static int THREADS = 4; static int SCALE = 100; static int ITER = 10; From 70115d8b8c0e52d8f196622901639fffed41ff9c Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 23:25:53 -0800 Subject: [PATCH 040/264] small fixes --- include/mimalloc/internal.h | 41 +++++--------------- src/arena.c | 8 ++-- src/free.c | 12 +++--- src/heap.c | 15 +------- src/os.c | 6 +-- src/page-queue.c | 26 ++++--------- src/page.c | 77 ++----------------------------------- 7 files changed, 34 insertions(+), 151 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index ad7c41c6..28eca4bb 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -540,30 +540,16 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { if (heap != NULL) { - // mi_atomic_store_release(&page->xheap, (uintptr_t)heap); page->heap = heap; page->heap_tag = heap->tag; mi_atomic_store_release(&page->xthread_id, heap->thread_id); } else { - // mi_atomic_store_release(&page->xheap, (uintptr_t)heap->tld->subproc); page->heap = NULL; mi_atomic_store_release(&page->xthread_id,0); } } -//static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { -// mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); -// if (heap != NULL) { -// mi_atomic_store_release(&page->xheap, 
(uintptr_t)heap); -// page->heap_tag = heap->tag; -// mi_atomic_store_release(&page->xthread_id, heap->thread_id); -// } -// else { -// mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc); -// mi_atomic_store_release(&page->xthread_id,0); -// } -//} // Thread free flag helpers static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { @@ -650,24 +636,24 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_atomic_load_acquire(&page->xthread_id) <= 1); + return (mi_atomic_load_relaxed(&page->xthread_id) <= 1); } static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return (mi_atomic_load_acquire(&page->xthread_id) == 1); + return (mi_atomic_load_relaxed(&page->xthread_id) == 1); } static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { - mi_atomic_or_acq_rel(&page->xthread_id, (uintptr_t)1); + mi_atomic_or_relaxed(&page->xthread_id, (uintptr_t)1); } static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { - mi_atomic_and_acq_rel(&page->xthread_id, ~(uintptr_t)1); + mi_atomic_and_relaxed(&page->xthread_id, ~(uintptr_t)1); } static inline bool mi_page_is_huge(const mi_page_t* page) { - return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || + return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page)); } @@ -683,15 +669,6 @@ static inline void _mi_page_unown_unconditional(mi_page_t* page) { mi_assert_internal(mi_page_thread_id(page)==0); const uintptr_t old = mi_atomic_and_acq_rel(&page->xthread_free, ~((uintptr_t)1)); mi_assert_internal((old&1)==1); MI_UNUSED(old); - /* - mi_thread_free_t tf_new; - mi_thread_free_t tf_old; - do { - tf_old = mi_atomic_load_relaxed(&page->xthread_free); - mi_assert_internal(mi_tf_is_owned(tf_old)); - tf_new = mi_tf_create(mi_tf_block(tf_old), false); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); - */ } @@ -721,7 +698,7 @@ static inline bool _mi_page_unown(mi_page_t* page) { } mi_assert_internal(mi_tf_block(tf_old)==NULL); tf_new = mi_tf_create(NULL, false); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); return false; } @@ -729,15 +706,15 @@ static inline bool _mi_page_unown(mi_page_t* page) { // Page flags //----------------------------------------------------------- static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { - return mi_atomic_load_acquire(&page->xflags); + return mi_atomic_load_relaxed(&page->xflags); } static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { if (set) { - mi_atomic_or_acq_rel(&page->xflags, newflag); + mi_atomic_or_relaxed(&page->xflags, newflag); } else { - mi_atomic_and_acq_rel(&page->xflags, ~newflag); + mi_atomic_and_relaxed(&page->xflags, ~newflag); } } diff --git a/src/arena.c b/src/arena.c index 45697081..8362a31f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -944,7 +944,7 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = 
mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { return true; } @@ -1140,7 +1140,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi if (i<10) { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; } else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } - + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5; @@ -1174,7 +1174,7 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) //size_t abandoned_total = 0; size_t purge_total = 0; for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; slice_total += arena->slice_count; _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); @@ -1324,7 +1324,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); if (max_arena == 0) return; // _mi_error_message(EFAULT, "purging not yet implemented\n"); diff --git a/src/free.c b/src/free.c index afb23838..ece55599 100644 --- a/src/free.c +++ b/src/free.c @@ -70,7 +70,7 @@ static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) do { mi_block_set_next(page, block, mi_tf_block(tf_old)); tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); // todo: release is enough? // and atomically try to collect the page if it was abandoned const bool is_owned_now = !mi_tf_is_owned(tf_old); @@ -207,17 +207,17 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { #endif // 1. free if the page is free now - if (mi_page_all_free(page)) + if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) - _mi_arena_page_unabandon(page); + _mi_arena_page_unabandon(page); // we can free the page directly _mi_arena_page_free(page); return; } - + // 2. if the page is not too full, we can try to reclaim it for ourselves - if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && + if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && !mi_page_is_used_at_frac(page,8)) { // the page has still some blocks in use (but not too many) @@ -234,7 +234,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? 
) (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) - { + { if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use // first remove it from the abandoned pages in the arena -- this waits for any readers to finish _mi_arena_page_unabandon(page); diff --git a/src/heap.c b/src/heap.c index 2ff40930..d687f25e 100644 --- a/src/heap.c +++ b/src/heap.c @@ -136,24 +136,11 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) _mi_arena_reclaim_all_abandoned(heap); } - // if abandoning, mark all pages to no longer add to delayed_free - //if (collect == MI_ABANDON) { - // mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL); - //} - - // free all current thread delayed blocks. - // (if abandoning, after this there are no more thread-delayed references into the pages.) - // _mi_heap_delayed_free_all(heap); - // collect retired pages _mi_heap_collect_retired(heap, force); // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - // mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); - - // collect segments (purge pages, this can be expensive so don't force on abandonment) - // _mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); // if forced, collect thread data cache on program-exit (or shared library unload) if (force && is_main_thread && mi_heap_is_backing(heap)) { @@ -219,7 +206,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool if (poolData != NULL) { heap->no_reclaim = true; } - } + } #endif if (heap == tld->heap_backing) { diff --git a/src/os.c b/src/os.c index 156a655b..b05068fd 100644 --- a/src/os.c +++ b/src/os.c @@ -113,8 +113,8 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_st if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } - if (still_committed) { - _mi_stat_decrease(&stats->committed, size); + if (still_committed) { + _mi_stat_decrease(&stats->committed, size); } _mi_stat_decrease(&stats->reserved, size); } @@ -556,7 +556,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { #endif } end = start + size; - } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end)); + } while (!mi_atomic_cas_weak_acq_rel(&mi_huge_start, &huge_start, end)); if (total_size != NULL) *total_size = size; return (uint8_t*)start; diff --git a/src/page-queue.c b/src/page-queue.c index ad616b1d..9e3aaacc 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -12,7 +12,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MI_IN_PAGE_C #error "this file should be included from 'page.c'" // include to help an IDE -#include "mimalloc.h" +#include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #endif @@ -83,10 +83,10 @@ static inline uint8_t mi_bin(size_t size) { #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif - wsize--; + wsize--; mi_assert_internal(wsize!=0); // find the highest bit position - uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); + uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). 
// - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin @@ -211,8 +211,8 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); - mi_assert_internal(mi_page_block_size(page) == queue->block_size || - (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; @@ -227,7 +227,6 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { heap->page_count--; page->next = NULL; page->prev = NULL; - // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL); mi_page_set_in_full(page,false); } @@ -243,7 +242,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); - // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap); + page->next = queue->first; page->prev = NULL; if (queue->first != NULL) { @@ -346,8 +345,8 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* page->prev = to->first; page->next = next; to->first->next = page; - if (next != NULL) { - next->prev = page; + if (next != NULL) { + next->prev = page; } else { to->last = page; @@ -385,15 +384,6 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue // set append pages to new heap and count size_t count = 0; for (mi_page_t* page = append->first; page != NULL; page = page->next) { - /* - // inline `mi_page_set_heap` to avoid wrong assertion during absorption; - // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive. - mi_atomic_store_release(&page->xheap, (uintptr_t)heap); - // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a - // side effect that it spins until any DELAYED_FREEING is finished. This ensures - // that after appending only the new heap will be used for delayed free operations. - _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false); - */ mi_page_set_heap(page, heap); count++; } diff --git a/src/page.c b/src/page.c index 056c9506..54e7b539 100644 --- a/src/page.c +++ b/src/page.c @@ -132,40 +132,6 @@ bool _mi_page_is_valid(mi_page_t* page) { } #endif -/* -void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { - while (!_mi_page_try_use_delayed_free(page, delay, override_never)) { - mi_atomic_yield(); - } -} - -bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { - mi_thread_free_t tfreex; - mi_delayed_t old_delay; - mi_thread_free_t tfree; - size_t yield_count = 0; - do { - tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS; - tfreex = mi_tf_set_delayed(tfree, delay); - old_delay = mi_tf_delayed(tfree); - if mi_unlikely(old_delay == MI_DELAYED_FREEING) { - if (yield_count >= 4) return false; // give up after 4 tries - yield_count++; - mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done. 
- // tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail - } - else if (delay == old_delay) { - break; // avoid atomic operation if already equal - } - else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) { - break; // leave never-delayed flag set - } - } while ((old_delay == MI_DELAYED_FREEING) || - !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - return true; // success -} -*/ /* ----------------------------------------------------------- Page collect the `local_free` and `thread_free` lists @@ -181,7 +147,7 @@ static void _mi_page_thread_free_collect(mi_page_t* page) head = mi_tf_block(tfree); if (head == NULL) return; // return if the list is empty tfreex = mi_tf_create(NULL,mi_tf_is_owned(tfree)); // set the thread free list to NULL - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough? mi_assert_internal(head != NULL); // find the tail -- also to get a proper count (without data races) @@ -334,43 +300,6 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { return page; } -/* ----------------------------------------------------------- - Do any delayed frees - (put there by other threads if they deallocated in a full page) ------------------------------------------------------------ */ -/* -void _mi_heap_delayed_free_all(mi_heap_t* heap) { - while (!_mi_heap_delayed_free_partial(heap)) { - mi_atomic_yield(); - } -} - -// returns true if all delayed frees were processed -bool _mi_heap_delayed_free_partial(mi_heap_t* heap) { - // take over the list (note: no atomic exchange since it is often NULL) - mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { }; - bool all_freed = true; - - // and free them all - while(block != NULL) { - mi_block_t* next = mi_block_nextx(heap,block, heap->keys); - // use internal free instead of regular one to keep stats etc correct - if (!_mi_free_delayed_block(block)) { - // we might already start delayed freeing while another thread has not yet - // reset the delayed_freeing flag; in that case delay it further by reinserting the current block - // into the delayed free list - all_freed = false; - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap, block, dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - block = next; - } - return all_freed; -} -*/ /* ----------------------------------------------------------- Unfull, abandon, free and retire @@ -765,7 +694,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m #if MI_STAT size_t count = 0; #endif - long candidate_limit = 0; // we reset this on the first candidate to limit the search + long candidate_limit = 0; // we reset this on the first candidate to limit the search long full_page_retain = _mi_option_get_fast(mi_option_full_page_retain); mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -777,7 +706,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m count++; #endif candidate_limit--; - + // collect freed blocks by us and other threads _mi_page_free_collect(page, false); From 
9631b0d4d2259c2bc2cf9808b40c444cee7ea3f2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 14:03:51 -0800 Subject: [PATCH 041/264] revise visiting arenas, better bitmap scanning --- src/arena.c | 83 ++++++++++++----------- src/bitmap.c | 188 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 164 insertions(+), 107 deletions(-) diff --git a/src/arena.c b/src/arena.c index 8362a31f..8b9ab4da 100644 --- a/src/arena.c +++ b/src/arena.c @@ -202,20 +202,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // set the dirty bits if (arena->memid.initially_zero) { - // size_t dirty_count = 0; - memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL); - //if (dirty_count>0) { - // if (memid->initially_zero) { - // _mi_error_message(EFAULT, "ouch1\n"); - // } - // // memid->initially_zero = false; - //} - //else { - // if (!memid->initially_zero) { - // _mi_error_message(EFAULT, "ouch2\n"); - // } - // // memid->initially_zero = true; - //} + memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL); } // set commit state @@ -235,7 +222,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( #if MI_DEBUG > 1 if (memid->initially_zero) { if (!mi_mem_is_zero(p, mi_size_of_slices(slice_count))) { - _mi_error_message(EFAULT, "arena allocation was not zero-initialized!\n"); + _mi_error_message(EFAULT, "interal error: arena allocation was not zero-initialized!\n"); memid->initially_zero = false; } } @@ -327,31 +314,47 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are return true; } -#define MI_THREADS_PER_ARENA (16) -#define mi_forall_arenas(req_arena_id, allow_large, tseq, var_arena_id, var_arena) \ +#define mi_forall_arenas(req_arena_id, tseq, name_arena) \ { \ - size_t _max_arena; \ - size_t _start; \ - if (req_arena_id == _mi_arena_id_none()) { \ - _max_arena = mi_atomic_load_relaxed(&mi_arena_count); \ - _start = (_max_arena <= 2 ? 0 : (tseq % (_max_arena-1))); \ - } \ - else { \ - _max_arena = 1; \ - _start = mi_arena_id_index(req_arena_id); \ - mi_assert_internal(mi_atomic_load_relaxed(&mi_arena_count) > _start); \ - } \ - for (size_t i = 0; i < _max_arena; i++) { \ - size_t _idx = i + _start; \ - if (_idx >= _max_arena) { _idx -= _max_arena; } \ - const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); MI_UNUSED(var_arena_id);\ - mi_arena_t* const var_arena = mi_arena_from_index(_idx); \ - if (var_arena != NULL && mi_arena_is_suitable(var_arena,req_arena_id,-1 /* todo: numa node */,allow_large)) \ - { + const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \ + if (_arena_count > 0) { \ + const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \ + size_t _start; \ + if (req_arena_id == _mi_arena_id_none()) { \ + /* always start searching in an arena 1 below the max */ \ + _start = (_arena_cycle <= 1 ? 
0 : (tseq % _arena_cycle)); \ + } \ + else { \ + _start = mi_arena_id_index(req_arena_id); \ + mi_assert_internal(_start < _arena_count); \ + } \ + for (size_t _i = 0; _i < _arena_count; _i++) { \ + size_t _idx; \ + if (_i < _arena_cycle) { \ + _idx = _i + _start; \ + if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \ + } \ + else { \ + _idx = _i; \ + } \ + mi_arena_t* const name_arena = mi_arena_from_index(_idx); \ + if (name_arena != NULL) \ + { -#define mi_forall_arenas_end() }}} +#define mi_forall_arenas_end() \ + } \ + if (req_arena_id != _mi_arena_id_none()) break; \ + } \ + }} +#define mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, name_arena) \ + mi_forall_arenas(req_arena_id,tseq,name_arena) { \ + if (mi_arena_is_suitable(name_arena, req_arena_id, -1 /* todo: numa node */, allow_large)) { \ + +#define mi_forall_suitable_arenas_end() \ + }} \ + mi_forall_arenas_end() /* ----------------------------------------------------------- Arena allocation @@ -369,12 +372,12 @@ static mi_decl_noinline void* mi_arena_try_find_free( // search arena's const size_t tseq = tld->tseq; - mi_forall_arenas(req_arena_id, allow_large, tseq, arena_id, arena) + mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) { void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); if (p != NULL) return p; } - mi_forall_arenas_end(); + mi_forall_suitable_arenas_end(); return NULL; } @@ -517,7 +520,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // search arena's const bool allow_large = true; size_t tseq = tld->tseq; - mi_forall_arenas(req_arena_id, allow_large, tseq, arena_id, arena) + mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) { size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; @@ -545,7 +548,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl return page; } } - mi_forall_arenas_end(); + mi_forall_suitable_arenas_end(); return NULL; } diff --git a/src/bitmap.c b/src/bitmap.c index 7f4c8776..fb8468fa 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -42,9 +42,9 @@ static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { return mi_rotr(x,r); } -//static inline mi_bfield_t mi_bfield_zero(void) { -// return 0; -//} +static inline mi_bfield_t mi_bfield_zero(void) { + return 0; +} static inline mi_bfield_t mi_bfield_one(void) { return 1; @@ -64,9 +64,9 @@ static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { // Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). // return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) { - return mi_bfield_find_least_bit((set ? ~x : x), idx); -} +//static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) { +// return mi_bfield_find_least_bit((set ? ~x : x), idx); +//} // Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { @@ -244,10 +244,10 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t by // Try to clear a full field of bits atomically, and return true all bits transitioned from all 1's to 0's. 
// and false otherwise leaving the bit field as-is. -//static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { -// mi_bfield_t old = mi_bfield_all_set(); -// return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_zero()); -//} +static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { + mi_bfield_t old = mi_bfield_all_set(); + return mi_atomic_cas_strong_acq_rel(b, &old, mi_bfield_zero()); +} // Check if all bits corresponding to a mask are set. @@ -514,31 +514,33 @@ static inline __m256i mi_mm256_zero(void) { static inline __m256i mi_mm256_ones(void) { return _mm256_set1_epi64x(~0); } -static inline bool mi_mm256_is_ones(__m256i vec) { - return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); -} +//static inline bool mi_mm256_is_ones(__m256i vec) { +// return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); +//} static inline bool mi_mm256_is_zero( __m256i vec) { return _mm256_testz_si256(vec,vec); } #endif -// find least 0/1-bit in a chunk and try to set/clear it atomically +// Find least 1-bit in a chunk and try to clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. +// This is used to find free slices and abandoned pages and should be efficient. // todo: try neon version -static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk, size_t* pidx) { -#if defined(__AVX2__) && (MI_BCHUNK_BITS==256) +static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) - // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); size_t cidx; - if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear - if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically + if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; @@ -546,39 +548,42 @@ static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk } // try again } -#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; - #if 1 + #if 0 + // one vector at a time __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - if ((set ? 
mi_mm256_is_ones(vec) : mi_mm256_is_zero(vec))) { + if (mi_mm256_is_zero(vec)) { chunk_idx += 4; vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1); } - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) - // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 chunk_idx += _tzcnt_u32(mask) / 8; #else + // a cache line is 64b so we can just as well load all at the same time const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - const __m256i cmpv = (set ? mi_mm256_ones() : mi_mm256_zero()); - const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) - const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i cmpv = mi_mm256_zero(); + const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == 0 ? 0xFF : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == 0 ? 0xFF : 0) const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) - const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte (so each 8 bits are all set or clear) const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; - // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
- const size_t chunk_idx = _tzcnt_u64(mask) / 8; + chunk_idx = _tzcnt_u64(mask) / 8; #endif mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); size_t cidx; - if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear - if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically + if (mi_bfield_find_least_bit(b, &cidx)) { // find the bit-idx that is clear + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; @@ -586,11 +591,12 @@ static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk } // try again } -#else + #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t idx; - if mi_unlikely(mi_bfield_find_least_to_xset(set, chunk->bfields[i], &idx)) { // find least 0-bit - if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx)) { // try to set it atomically + if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically *pidx = (i*MI_BFIELD_BITS + idx); mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; @@ -598,48 +604,49 @@ static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk } } return false; -#endif + #endif } -static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) { - return mi_bchunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); -} - -//static inline bool mi_bchunk_find_and_try_set(mi_bchunk_t* chunk, size_t* pidx) { -// return mi_bchunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); -//} // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. +// Used to find medium size pages in the free blocks. // todo: try neon version static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) - while(true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi8(vec, mi_mm256_ones()); // (byte == ~0 ? -1 : 0) - const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte - if (mask == 0) return false; - const size_t i = _tzcnt_u32(mask); - mi_assert_internal(8*i < MI_BCHUNK_BITS); - const size_t chunk_idx = i / MI_BFIELD_SIZE; - const size_t byte_idx = i % MI_BFIELD_SIZE; - if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); - mi_assert_internal(*pidx < MI_BCHUNK_BITS); + #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + while (true) { + // since a cache-line is 64b, load all at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); + const __m256i cmpv = mi_mm256_ones(); + const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 
0xFF : 0) + const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte + const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte + const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; + // mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared) + if (mask==0) return false; + const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk + const size_t chunk_idx = bidx / 8; + const size_t byte_idx = bidx % 8; // byte index of the byte in the bfield + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx; + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; } // try again } #else for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t x = chunk->bfields[i]; + const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]); // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 >> 7; // shift high bit to low bit size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit + if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); mi_assert_internal((idx%8)==0); const size_t byte_idx = idx/8; @@ -656,14 +663,58 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid } + +// find least bfield in a chunk with all bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. +// Used to find large size pages in the free blocks. +// todo: try neon version +static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pidx) { +#if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + while (true) { + // since a cache-line is 64b, load all at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); + const __m256i cmpv = mi_mm256_ones(); + const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (bfield == ~0 ? -1 : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (bfield == ~0 ? -1 : 0) + const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte + const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte + const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; + // mask is inverted, so each 8-bits are set iff the corresponding elem64 has all bits set (and thus can be cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
+ const size_t chunk_idx = _tzcnt_u64(mask) / 8; + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx])) { + *pidx = chunk_idx*MI_BFIELD_BITS; + mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); + return true; + } + // try again + } +#else + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + if (~b==0 && mi_bfield_atomic_try_clearX(&chunk->bfields[i])) { + *pidx = i*MI_BFIELD_BITS; + mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); + return true; + } + } + return false; +#endif +} + + // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +// (We do not cross bfield boundaries) static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - mi_bfield_t b = chunk->bfields[i]; + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t bshift = 0; size_t idx; while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit @@ -680,8 +731,9 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* return true; } else { - // if failed to atomically commit, try again from this position - b = (chunk->bfields[i] >> bshift); + // if failed to atomically commit, reload b and try again from this position + bshift -= idx; + b = mi_atomic_load_relaxed(&chunk->bfields[i]) >> bshift; } } else { @@ -699,11 +751,11 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* // find a sequence of `n` bits in a chunk with `n < MI_BCHUNK_BITS` with all bits set, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +// This can cross bfield boundaries. static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - // if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); - - // we align an a field, and require `field_count` fields to be all clear. 
+ + // we align at a bfield, and scan `field_count` fields // n >= MI_BFIELD_BITS; find a first field that is 0 const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++) @@ -740,14 +792,16 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* return true; } } + // continue } return false; } static inline bool mi_bchunk_find_and_try_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); - if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); + if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); // small pages + if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearX(chunk, pidx); // large pages if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); return mi_bchunk_find_and_try_clearN_(chunk, n, pidx); From 6b52b19e3b6bd28eb61739ef0a21297993940b28 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 15:02:27 -0800 Subject: [PATCH 042/264] arch specific optimizations --- CMakeLists.txt | 2 +- ide/vs2022/mimalloc.vcxproj | 1 - src/bitmap.c | 57 ++++++++++++++++++------------------- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 89dad3b5..b1f66f5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -385,7 +385,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") else() - list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2 -O2) + list(APPEND mi_cflags -ftls-model=initial-exec) endif() endif() if(MI_OVERRIDE) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index d03fd281..e9a4a339 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -120,7 +120,6 @@ CompileAsCpp false stdcpp20 - AdvancedVectorExtensions2 diff --git a/src/bitmap.c b/src/bitmap.c index fb8468fa..8479555c 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -505,7 +505,7 @@ static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); } -// ------ find_and_try_xset -------- +// ------ try_find_and_clear -------- #if defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { @@ -526,7 +526,7 @@ static inline bool mi_mm256_is_zero( __m256i vec) { // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // This is used to find free slices and abandoned pages and should be efficient. // todo: try neon version -static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) { +static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) { #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -613,10 +613,10 @@ static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find medium size pages in the free blocks. 
// todo: try neon version -static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pidx) { +static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { - // since a cache-line is 64b, load all at once + // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); const __m256i cmpv = mi_mm256_ones(); @@ -628,9 +628,9 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid // mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared) if (mask==0) return false; const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk - const size_t chunk_idx = bidx / 8; + const size_t chunk_idx = bidx / 8; const size_t byte_idx = bidx % 8; // byte index of the byte in the bfield - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); @@ -668,10 +668,10 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find large size pages in the free blocks. // todo: try neon version -static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pidx) { -#if defined(__AVX2__) && (MI_BCHUNK_BITS==512) +static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { - // since a cache-line is 64b, load all at once + // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); const __m256i cmpv = mi_mm256_ones(); @@ -689,7 +689,7 @@ static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pid *pidx = chunk_idx*MI_BFIELD_BITS; mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); return true; - } + } // try again } #else @@ -710,7 +710,7 @@ static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pid // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. // (We do not cross bfield boundaries) -static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +static mi_decl_noinline bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -752,10 +752,10 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. // This can cross bfield boundaries. 
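/* A scalar model (not taken from the patches above) of the index arithmetic shared by the
   64-bit-lane AVX2 searches: `_mm256_cmpeq_epi64` marks a whole lane, `_mm256_movemask_epi8`
   turns that into 8 identical mask bits per lane, so `tzcnt(mask)/8` is the index of the
   first matching bfield. The function name is hypothetical, `__builtin_ctz` stands in for
   `_tzcnt_u32`, and only one 256-bit vector (4 lanes) is modeled; the 512-bit chunk paths
   combine two such masks into a 64-bit mask. This models the `try_find_and_clearX` case
   (first bfield with all bits set); `try_find_and_clear` instead compares against zero and
   inverts the mask to find the first bfield with any bit set. */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool first_full_field(const uint64_t fields[4], size_t* field_idx) {
  uint32_t mask = 0;
  for (size_t k = 0; k < 4; k++) {                      // models cmpeq_epi64 + movemask_epi8
    if (fields[k] == ~(uint64_t)0) { mask |= (0xFFu << (8*k)); }
  }
  if (mask == 0) return false;                          // no fully-set field among these lanes
  *field_idx = (size_t)__builtin_ctz(mask) / 8;         // 8 mask bits per 64-bit lane, hence the /8
  return true;
}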
-static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - - // we align at a bfield, and scan `field_count` fields + + // we align at a bfield, and scan `field_count` fields // n >= MI_BFIELD_BITS; find a first field that is 0 const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++) @@ -780,7 +780,7 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* m -= MI_BFIELD_BITS; // note: can underflow } } while (++j < field_count); - + // if all set, we can try to atomically clear them if (allset) { const size_t cidx = i*MI_BFIELD_BITS; @@ -798,13 +798,13 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* } -static inline bool mi_bchunk_find_and_try_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); // small pages - if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearX(chunk, pidx); // large pages +static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages + if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); - return mi_bchunk_find_and_try_clearN_(chunk, n, pidx); + if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); + return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); } @@ -858,7 +858,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } - // record the max clear + // record the max clear size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); do { if mi_likely(chunk_idx <= oldmax) break; @@ -1139,23 +1139,22 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -// (Used to find fresh free slices.) +// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS) mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { size_t cidx; - if mi_likely(mi_bchunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); return true; } else { // we may find that all are cleared only on a second iteration but that is ok as // the chunkmap is a conservative approximation. 
mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - // continue } } mi_bitmap_forall_chunks_end(); @@ -1171,7 +1170,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { size_t cidx; - if mi_likely(mi_bchunk_find_and_try_clear(&bitmap->chunks[chunk_idx], &cidx)) { + if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) { const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); bool keep_set = true; @@ -1182,19 +1181,17 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t return true; } else { - // failed to claim it, set abandoned mapping again (unless thet page was freed) + // failed to claim it, set abandoned mapping again (unless the page was freed) if (keep_set) { const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); mi_assert_internal(wasclear); MI_UNUSED(wasclear); } - // continue } } else { // we may find that all are cleared only on a second iteration but that is ok as // the chunkmap is a conservative approximation. mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - // continue } } mi_bitmap_forall_chunks_end(); From bf42759d976bd965eacd8a0b4c13c6dc9e6182d9 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 15:13:17 -0800 Subject: [PATCH 043/264] check heaptag on abandonded page allocation --- include/mimalloc/types.h | 13 ++++++++++++- src/arena.c | 17 ++++++++--------- src/bitmap.c | 4 ++-- src/bitmap.h | 4 ++-- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ba9a8864..d883ec52 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -237,6 +237,8 @@ typedef uintptr_t mi_thread_free_t; // Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) typedef struct mi_subproc_s mi_subproc_t; +// A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython) +typedef uint8_t mi_heaptag_t; // A page contains blocks of one specific size (`block_size`). // Each page has three list of free blocks: @@ -280,7 +282,7 @@ typedef struct mi_page_s { size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks - uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type + mi_heaptag_t heap_tag; // tag of the owning heap, used to separate heaps by object type bool free_is_zero; // `true` if the blocks in the free list are zero initialized // padding #if (MI_ENCODE_FREELIST || MI_PADDING) @@ -411,7 +413,16 @@ struct mi_heap_s { mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; +// ------------------------------------------------------ +// Arena's +// These are large reserved areas of memory allocated from +// the OS that are managed by mimalloc to efficiently +// allocate MI_SLICE_SIZE slices of memory for the +// mimalloc pages. +// ------------------------------------------------------ +// A large memory arena where pages are allocated in. 
+typedef struct mi_arena_s mi_arena_t; // ------------------------------------------------------ // Debug diff --git a/src/arena.c b/src/arena.c index 8b9ab4da..f6c0f0a3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -479,11 +479,9 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* arg2, bool* keep_abandoned) { +static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_abandoned) { // found an abandoned page of the right size - mi_arena_t* const arena = (mi_arena_t*)arg1; - mi_subproc_t* const subproc = (mi_subproc_t*)arg2; - mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); // can we claim ownership? if (!mi_page_try_claim_ownership(page)) { // there was a concurrent free .. @@ -493,8 +491,9 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* a *keep_abandoned = true; return false; } - if (subproc != page->subproc) { - // wrong sub-process.. we need to unown again + if (subproc != page->subproc || heap_tag != page->heap_tag) { + // wrong sub-process or heap_tag.. we need to unown again + // note: this normally never happens unless subprocesses/heaptags are actually used. // (an unown might free the page, and depending on that we can keep it in the abandoned map or not) // note: a minor wrinkle: the page will still be mapped but the abandoned map entry is (temporarily) clear at this point. // so we cannot check in `mi_arena_free` for this invariant to hold. @@ -507,7 +506,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* a return true; } -static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_heaptag_t heaptag, mi_tld_t* tld) { MI_UNUSED(slice_count); const size_t bin = _mi_bin(block_size); @@ -525,7 +524,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; - if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, subproc)) { + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, subproc, heaptag)) { // found an abandoned page of the right size // and claimed ownership. mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); @@ -632,7 +631,7 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size mi_tld_t* const tld = heap->tld; // 1. look for an abandoned page - mi_page_t* page = mi_arena_page_try_find_abandoned(slice_count, block_size, req_arena_id, tld); + mi_page_t* page = mi_arena_page_try_find_abandoned(slice_count, block_size, req_arena_id, heap->tag, tld); if (page != NULL) { return page; // return as abandoned } diff --git a/src/bitmap.c b/src/bitmap.c index 8479555c..cdeeb009 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1165,7 +1165,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t // Find a set bit in the bitmap and try to atomically clear it and claim it. 
// (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, void* arg1, void* arg2) + mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ) { mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { @@ -1174,7 +1174,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); bool keep_set = true; - if ((*claim)(slice_index, arg1, arg2, &keep_set)) { + if ((*claim)(slice_index, arena, subproc, heap_tag, &keep_set)) { // success! mi_assert_internal(!keep_set); *pidx = slice_index; diff --git a/src/bitmap.h b/src/bitmap.h index 7938bfa0..aaa552ad 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -185,10 +185,10 @@ static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); -typedef bool (mi_claim_fun_t)(size_t slice_index, void* arg1, void* arg2, bool* keep_set); +typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_set); mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, void* arg1, void* arg2); + mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ); void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); From d0c86f3f0e625236da685c9668378657cc8e79ba Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 16:26:07 -0800 Subject: [PATCH 044/264] specialize bitmap operations for common page sizes --- src/bitmap.c | 372 ++++++++++++++++++++----------------------------- src/bitmap.h | 38 ++--- src/page-map.c | 8 +- 3 files changed, 175 insertions(+), 243 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index cdeeb009..b76dfc77 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -27,10 +27,6 @@ static inline size_t mi_bfield_popcount(mi_bfield_t x) { return mi_popcount(x); } -//static inline size_t mi_bfield_clz(mi_bfield_t x) { -// return mi_clz(x); -//} - // find the least significant bit that is set (i.e. count trailing zero's) // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -55,18 +51,13 @@ static inline mi_bfield_t mi_bfield_all_set(void) { } static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { + mi_assert_internal(bit_count > 0); mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS); const mi_bfield_t mask0 = (bit_count < MI_BFIELD_BITS ? (mi_bfield_one() << bit_count)-1 : mi_bfield_all_set()); return (mask0 << shiftl); } - -// Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). -// return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, -// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -//static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) { -// return mi_bfield_find_least_bit((set ? ~x : x), idx); -//} +// ------- mi_bfield_atomic_set --------------------------------------- // Set a bit atomically. 
Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { @@ -105,15 +96,6 @@ static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_ mi_assert_internal((old&mask)==mask); // we should only clear when it was set } -// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). -static inline bool mi_bfield_atomic_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { - if (set) { - return mi_bfield_atomic_set(b, idx); - } - else { - return mi_bfield_atomic_clear(b, idx, NULL); - } -} // Set a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's. static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) { @@ -144,13 +126,33 @@ static inline bool mi_bfield_atomic_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t } } +static inline bool mi_bfield_atomic_set8(_Atomic(mi_bfield_t)*b, size_t byte_idx) { + mi_assert_internal(byte_idx < MI_BFIELD_SIZE); + const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); + return mi_bfield_atomic_xset_mask(MI_BIT_SET, b, mask, NULL); +} + +static inline bool mi_bfield_atomic_clear8(_Atomic(mi_bfield_t)*b, size_t byte_idx, bool* all_clear) { + mi_assert_internal(byte_idx < MI_BFIELD_SIZE); + const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); + mi_bfield_t old = mi_atomic_load_relaxed(b); + while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)) {}; // try to atomically clear the mask bits until success + if (all_clear!=NULL) { *all_clear = ((old&~mask)==0); } + return ((old&mask) == mask); +} + +static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b) { + const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_all_set()); + return (old==0); +} + +static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b) { + const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero()); + return (~old==0); +} + +// ------- mi_bfield_atomic_try_xset --------------------------------------- -// Tries to set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 -// and otherwise false (leaving the bit unchanged) -//static inline bool mi_bfield_atomic_try_set(_Atomic(mi_bfield_t)*b, size_t idx) { -// mi_assert_internal(idx < MI_BFIELD_BITS); -// return mi_bfield_atomic_set(b, idx); // for a single bit there is no difference -//} // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0. // `all_clear` is set to true if the new bfield is zero (and false otherwise) @@ -162,14 +164,6 @@ static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx return ((old&mask) == mask); } -// Tries to set/clear a bit atomically, and returns true if the bit atomically transitioned from 0 to 1 (or 1 to 0) -static inline bool mi_bfield_atomic_try_xset( mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal(idx < MI_BFIELD_BITS); - // for a single bit, we can always just set/clear and test afterwards if it was actually us that changed it first - return mi_bfield_atomic_xset(set, b, idx); -} - - // Tries to set a mask atomically, and returns true if the mask bits atomically transitioned from 0 to mask // and false otherwise (leaving the bit field as is). 
static inline bool mi_bfield_atomic_try_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { @@ -211,13 +205,6 @@ static inline bool mi_bfield_atomic_try_xset_mask(mi_xset_t set, _Atomic(mi_bfie } } -// Tries to set a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF -// and false otherwise (leaving the bit field as is). -static inline bool mi_bfield_atomic_try_set8(_Atomic(mi_bfield_t)*b, size_t byte_idx) { - mi_assert_internal(byte_idx < MI_BFIELD_SIZE); - const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); - return mi_bfield_atomic_try_set_mask(b, mask); -} // Tries to clear a byte atomically, and returns true if the byte atomically transitioned from 0xFF to 0 static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t byte_idx, bool* all_clear) { @@ -226,22 +213,6 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t by return mi_bfield_atomic_try_clear_mask(b, mask, all_clear); } -//// Tries to set/clear a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF (or 0xFF to 0) -//// and false otherwise (leaving the bit field as is). -//static inline bool mi_bfield_atomic_try_xset8(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t byte_idx) { -// mi_assert_internal(byte_idx < MI_BFIELD_SIZE); -// const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); -// return mi_bfield_atomic_try_xset_mask(set, b, mask); -//} - - -// Try to set a full field of bits atomically, and return true all bits transitioned from all 0's to 1's. -// and false otherwise leaving the bit field as-is. -//static inline bool mi_bfield_atomic_try_setX(_Atomic(mi_bfield_t)*b) { -// mi_bfield_t old = 0; -// return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_all_set()); -//} - // Try to clear a full field of bits atomically, and return true all bits transitioned from all 1's to 0's. // and false otherwise leaving the bit field as-is. static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { @@ -250,6 +221,9 @@ static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { } +// ------- mi_bfield_atomic_is_set --------------------------------------- + + // Check if all bits corresponding to a mask are set. 
static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { mi_assert_internal(mask != 0); @@ -275,26 +249,12 @@ static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfiel } -// Check if a bit is set/clear -// static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { -// mi_assert_internal(idx < MI_BFIELD_BITS); -// const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx); -//} +// ------- mi_bchunk_xset --------------------------------------- static inline bool mi_bchunk_set(mi_bchunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BCHUNK_BITS); @@ -310,6 +270,30 @@ static inline bool mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_ return mi_bfield_atomic_clear(&chunk->bfields[i], idx, maybe_all_clear); } +static inline bool mi_bchunk_set8(mi_bchunk_t* chunk, size_t byte_idx) { + mi_assert_internal(byte_idx < MI_BCHUNK_SIZE); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t bidx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_set8(&chunk->bfields[i], bidx); +} + +static inline bool mi_bchunk_clear8(mi_bchunk_t* chunk, size_t byte_idx, bool* maybe_all_clear) { + mi_assert_internal(byte_idx < MI_BCHUNK_SIZE); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t bidx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_clear8(&chunk->bfields[i], bidx, maybe_all_clear); +} + +static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t field_idx) { + mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); + return mi_bfield_atomic_setX(&chunk->bfields[field_idx]); +} + +static inline bool mi_bchunk_clearX(mi_bchunk_t* chunk, size_t field_idx, bool* maybe_all_clear) { + mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); + if (maybe_all_clear != NULL) { *maybe_all_clear = true; } + return mi_bfield_atomic_clearX(&chunk->bfields[field_idx]); +} // Set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). @@ -340,7 +324,6 @@ static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size return all_transition; } - static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { return mi_bchunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); } @@ -351,74 +334,46 @@ static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, s -// ------ is_xset -------- +// ------- mi_bchunk_is_xset --------------------------------------- // Check if a sequence of `n` bits within a chunk are all set/cleared. 
-static bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); - mi_assert_internal(n>0); - size_t idx = cidx % MI_BFIELD_BITS; - size_t field = cidx / MI_BFIELD_BITS; +// This can cross bfield's +mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) { + mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS); while (n > 0) { size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field if (m > n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BCHUNK_FIELDS); + mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); const size_t mask = mi_bfield_mask(m, idx); - if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mask)) { + if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field_idx], mask)) { return false; } // next field - field++; + field_idx++; idx = 0; n -= m; } return true; } - -// ------ try_xset -------- - -static inline bool mi_bchunk_try_xset(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx); -} - -static inline bool mi_bchunk_try_set(mi_bchunk_t* chunk, size_t cidx) { - return mi_bchunk_try_xset(MI_BIT_SET, chunk, cidx); -} - -static inline bool mi_bchunk_try_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, maybe_all_clear); +// Check if a sequence of `n` bits within a chunk are all set/cleared. +static inline bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); + mi_assert_internal(n>0); + if (n==0) return true; + size_t field = cidx / MI_BFIELD_BITS; + size_t idx = cidx % MI_BFIELD_BITS; + if mi_likely(n<=MI_BFIELD_BITS) { + return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mi_bfield_mask(n, idx)); + } + else { + return mi_bchunk_is_xsetN_(set, chunk, field, idx, n); + } } -//static inline bool mi_bchunk_try_xset8(mi_xset_t set, mi_bchunk_t* chunk, size_t byte_idx) { -// mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); -// const size_t i = byte_idx / MI_BFIELD_SIZE; -// const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; -// return mi_bfield_atomic_try_xset8(set, &chunk->bfields[i], ibyte_idx); -//} - -static inline bool mi_bchunk_try_set8(mi_bchunk_t* chunk, size_t byte_idx) { - mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_set8(&chunk->bfields[i], ibyte_idx); -} - -static inline bool mi_bchunk_try_clear8(mi_bchunk_t* chunk, size_t byte_idx, bool* maybe_all_clear) { - mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_clear8(&chunk->bfields[i], ibyte_idx, maybe_all_clear); -} - +// ------- mi_bchunk_try_xset --------------------------------------- // Try to atomically set/clear a sequence of `n` bits within a chunk. 
// Returns true if all bits transitioned from 0 to 1 (or 1 to 0), @@ -490,22 +445,16 @@ restore: return false; } -static inline bool mi_bchunk_try_setN(mi_bchunk_t* chunk, size_t cidx, size_t n) { - return mi_bchunk_try_xsetN(MI_BIT_SET, chunk, cidx, n, NULL); -} +// static inline bool mi_bchunk_try_setN(mi_bchunk_t* chunk, size_t cidx, size_t n) { +// return mi_bchunk_try_xsetN(MI_BIT_SET, chunk, cidx, n, NULL); +// } static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { return mi_bchunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n, maybe_all_clear); } -static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); -} -// ------ try_find_and_clear -------- +// ------- mi_bchunk_try_find_and_clear --------------------------------------- #if defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { @@ -808,6 +757,18 @@ static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, s } +// ------- mi_bchunk_clear_once_set --------------------------------------- + +static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); +} + + +// ------- mi_bitmap_all_are_clear --------------------------------------- + // are all bits in a bitmap chunk clear? (this uses guaranteed atomic reads) static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -831,12 +792,6 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #endif } - -/* -------------------------------------------------------------------------------- - chunkmap --------------------------------------------------------------------------------- */ - - /* -------------------------------------------------------------------------------- bitmap chunkmap -------------------------------------------------------------------------------- */ @@ -866,6 +821,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) return true; } + /* -------------------------------------------------------------------------------- bitmap -------------------------------------------------------------------------------- */ @@ -941,82 +897,9 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { } -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), -// and false otherwise leaving the bitmask as is. 
-static bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool ok = mi_bchunk_try_set(&bitmap->chunks[chunk_idx], cidx); - if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards - return ok; - } - else { - bool maybe_all_clear; - const bool ok = mi_bchunk_try_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return ok; - } -} -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -static bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - mi_assert_internal(idx%8 == 0); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t byte_idx = (idx % MI_BCHUNK_BITS)/8; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool ok = mi_bchunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); - if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards - return ok; - } - else { - bool maybe_all_clear; - const bool ok = mi_bchunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return ok; - } -} - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -static bool mi_bitmap_try_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BCHUNK_BITS); - mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); - if (n==0 || idx + n > mi_bitmap_max_bits(bitmap)) return false; - - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia - if (set) { - const bool ok = mi_bchunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); - if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards - return ok; - } - else { - bool maybe_all_clear; - const bool ok = mi_bchunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return ok; - } -} - -mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); - if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); - if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); - // todo: add 32/64 for large pages ? 
- return mi_bitmap_try_xsetN_(set, bitmap, idx, n); -} +// ------- mi_bitmap_xset --------------------------------------- // Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { @@ -1037,6 +920,48 @@ bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { } } +// Set/clear aligned 8-bits in the bitmap (with `(idx%8)==0`). +// Returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +static bool mi_bitmap_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); + mi_assert_internal((idx%8)==0); + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t byte_idx = (idx % MI_BCHUNK_BITS)/8; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (set) { + const bool wasclear = mi_bchunk_set8(&bitmap->chunks[chunk_idx], byte_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return wasclear; + } + else { + bool maybe_all_clear; + const bool wasset = mi_bchunk_clear8(&bitmap->chunks[chunk_idx], byte_idx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return wasset; + } +} + +// Set/clear a field of bits. +// Returns `true` if atomically transitioned from 0 to ~0 (or ~0 to 0) +static bool mi_bitmap_xsetX(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); + mi_assert_internal((idx%MI_BFIELD_BITS)==0); + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t field_idx = (idx % MI_BCHUNK_BITS)/MI_BFIELD_BITS; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (set) { + const bool wasclear = mi_bchunk_setX(&bitmap->chunks[chunk_idx],field_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return wasclear; + } + else { + bool maybe_all_clear; + const bool wasset = mi_bchunk_clearX(&bitmap->chunks[chunk_idx], field_idx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return wasset; + } +} + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { @@ -1067,14 +992,15 @@ static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, siz // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); - //TODO: specialize? - //if (n==1) return mi_bitmap_xset(set, bitmap, idx); - //if (n==2) return mi_bitmap_xset(set, bitmap, idx); - //if (n==8) return mi_bitmap_xset8(set, bitmap, idx); + if (n==1) return mi_bitmap_xset(set, bitmap, idx); + if (n==8) return mi_bitmap_xset8(set, bitmap, idx); + if (n==MI_BFIELD_BITS) return mi_bitmap_xsetX(set, bitmap, idx); return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); } +// ------- mi_bitmap_is_xset --------------------------------------- + // Is a sequence of n bits already all set/cleared? 
bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); @@ -1091,10 +1017,11 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } -/* -------------------------------------------------------------------------------- - bitmap try_find_and_clear --------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------- + bitmap try_find_and_clear + (used to find free pages) +-------------------------------------------------------------------------------- */ #define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ { \ @@ -1116,7 +1043,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0) { \ + if (_i == 0 && chunkmap_start_idx > 0) { \ cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \ cmap_idx_shift = chunkmap_start_idx; \ } \ @@ -1162,6 +1089,11 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t } +/* -------------------------------------------------------------------------------- + bitmap try_find_and_claim + (used to allocate abandoned pages) +-------------------------------------------------------------------------------- */ + // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, diff --git a/src/bitmap.h b/src/bitmap.h index aaa552ad..7d6d8f97 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -82,7 +82,7 @@ typedef mi_bchunk_t mi_bchunkmap_t; #if MI_SIZE_BITS > 32 #define MI_BITMAP_DEFAULT_CHUNK_COUNT (64) // 2 GiB on 64-bit -- this is for the page map #else -#define MI_BITMAP_DEFAULT_CHUNK_COUNT (1) +#define MI_BITMAP_DEFAULT_CHUNK_COUNT (1) #endif #define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena #define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena @@ -92,7 +92,7 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared + _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT @@ -126,7 +126,8 @@ size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count); // returns the size of the bitmap. size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); -// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). +// Not atomic so only use if still local to a thread. 
void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); @@ -144,7 +145,8 @@ static inline bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -// If `already_xset` is not NULL, it is to all the bits were already all set/cleared. +// If `already_xset` is not NULL, it is set to the count of bits that were already set/cleared. +// (this is used for correct statistics if committing over a partially committed area) bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { @@ -159,6 +161,8 @@ static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +// Is a sequence of n bits already set? +// (Used to check if a memory range is already committed) static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); } @@ -168,28 +172,24 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { } -// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_try_xsetN(MI_BIT_SET, bitmap, idx, n); - } - -static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); -} - -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); + +// Called once a bit is cleared to see if the memory slice can be claimed. typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_set); -mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, +// Find a set bit in the bitmap, atomically clear it, and check if `claim` returns true. +// If not claimed, continue on (potentially setting the bit again depending on `keep_set`). +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ); + +// Atomically clear a bit but only if it is set. Will block otherwise until the bit is set. +// This is used to delay freeing a page that is at the same time being considered to be +// allocated from `mi_arena_try_abandoned` (and is in the `claim` function of `mi_bitmap_try_find_and_claim`).
void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); #endif // MI_BITMAP_H diff --git a/src/page-map.c b/src/page-map.c index d849e6a2..7a00d172 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -55,14 +55,14 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, i, 1)) { + if (mi_bitmap_is_clearN(&mi_page_map_commit, i, 1)) { // this may race, in which case we do multiple commits (which is ok) bool is_zero; uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); const size_t size = mi_page_map_entries_per_commit_bit; - _mi_os_commit(start, size, &is_zero, NULL); + _mi_os_commit(start, size, &is_zero, NULL); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } - mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, i, 1, NULL); + mi_bitmap_set(&mi_page_map_commit, i); } } #if MI_DEBUG > 0 @@ -119,7 +119,7 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { uintptr_t idx = ((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { + if (!mi_page_map_all_committed || mi_bitmap_is_setN(&mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { From c33de86da35b23cebf0dbadea10ac9316a2441b4 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 17:11:11 -0800 Subject: [PATCH 045/264] check for running in a threadpool to disable page reclaim --- include/mimalloc/prim.h | 3 ++- include/mimalloc/types.h | 19 ++++++++++--------- src/arena.c | 12 ++++++++---- src/bitmap.h | 2 +- src/free.c | 2 +- src/heap.c | 30 +++++++++++------------------- src/init.c | 17 ++++++++++------- src/prim/emscripten/prim.c | 5 ++++- src/prim/unix/prim.c | 4 ++++ src/prim/wasi/prim.c | 4 ++++ src/prim/windows/prim.c | 25 +++++++++++++++++++++++-- test/test-stress.c | 4 ++-- 12 files changed, 80 insertions(+), 47 deletions(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 8a627438..65f65376 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -117,7 +117,8 @@ void _mi_prim_thread_done_auto_done(void); // Called when the default heap for a thread changes void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); - +// Is this thread part of a thread pool? +bool _mi_prim_thread_is_in_threadpool(void); //------------------------------------------------------------------- // Thread id: `_mi_prim_thread_id()` diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d883ec52..e10786a0 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -399,7 +399,7 @@ struct mi_heap_s { size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. 
mi_heap_t* next; // list of heaps per thread - bool no_reclaim; // `true` if this heap should not reclaim abandoned pages + bool allow_page_reclaim; // `true` if this heap can reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types #if MI_GUARDED @@ -568,14 +568,15 @@ typedef struct mi_os_tld_s { // Thread local data struct mi_tld_s { - unsigned long long heartbeat; // monotonic heartbeat count - bool recurse; // true if deferred was called; used to prevent infinite recursion. - mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) - mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - mi_subproc_t* subproc; // sub-process this thread belongs to. - size_t tseq; // thread sequence id - mi_os_tld_t os; // os tld - mi_stats_t stats; // statistics + unsigned long long heartbeat; // monotonic heartbeat count + mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) + mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) + mi_subproc_t* subproc; // sub-process this thread belongs to. + size_t tseq; // thread sequence id + bool recurse; // true if deferred was called; used to prevent infinite recursion. + bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) + mi_os_tld_t os; // os tld + mi_stats_t stats; // statistics }; #endif diff --git a/src/arena.c b/src/arena.c index f6c0f0a3..fa7d53ed 100644 --- a/src/arena.c +++ b/src/arena.c @@ -585,21 +585,25 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); - // claimed free slices: initialize the page partly + // claimed free slices: initialize the page partly if (!memid.initially_zero) { + mi_track_mem_undefined(page, slice_count * MI_ARENA_SLICE_SIZE); _mi_memzero_aligned(page, sizeof(*page)); } - #if MI_DEBUG > 1 else { + mi_track_mem_defined(page, slice_count * MI_ARENA_SLICE_SIZE); + } + #if MI_DEBUG > 1 + if (memid.initially_zero) { if (!mi_mem_is_zero(page, mi_size_of_slices(slice_count))) { - _mi_error_message(EFAULT, "page memory was not zero initialized!\n"); + _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n"); memid.initially_zero = false; _mi_memzero_aligned(page, sizeof(*page)); } } #endif if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) { - _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small\n"); + _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small.\n"); }; const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); diff --git a/src/bitmap.h b/src/bitmap.h index 7d6d8f97..40c4df42 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. diff --git a/src/free.c b/src/free.c index ece55599..d45507e7 100644 --- a/src/free.c +++ b/src/free.c @@ -230,7 +230,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); if ((tagheap != NULL) && // don't reclaim across heap object types - (!tagheap->no_reclaim) && // we are allowed to reclaim abandoned pages + (tagheap->allow_page_reclaim) && // we are allowed to reclaim abandoned pages (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) diff --git a/src/heap.c b/src/heap.c index d687f25e..3bf8b976 100644 --- a/src/heap.c +++ b/src/heap.c @@ -128,7 +128,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) #else collect >= MI_FORCE #endif - && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim) + && is_main_thread && mi_heap_is_backing(heap) && heap->allow_page_reclaim) { // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. // if all memory is freed by now, all segments should be freed. @@ -192,23 +192,14 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool heap->tld = tld; heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; - heap->no_reclaim = noreclaim; + heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->tag = tag; - - #if defined(WIN32) && (MI_ARCH_X64 || MI_ARCH_X86) - // disallow reclaim for threads running in the windows threadpool - const DWORD winVersion = GetVersion(); - const DWORD winMajorVersion = (DWORD)(LOBYTE(LOWORD(winVersion))); - if (winMajorVersion >= 6) { - _TEB* const teb = NtCurrentTeb(); - void* const poolData = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778))); - if (poolData != NULL) { - heap->no_reclaim = true; - } + if (tld->is_in_threadpool) { + // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. 
+ // (but abandoning is good in this case) + heap->allow_page_reclaim = false; } - #endif - if (heap == tld->heap_backing) { _mi_random_init(&heap->random); } @@ -364,7 +355,8 @@ static bool mi_cdecl mi_heap_track_block_free(const mi_heap_t* heap, const mi_he void mi_heap_destroy(mi_heap_t* heap) { mi_assert(heap != NULL); mi_assert(mi_heap_is_initialized(heap)); - mi_assert(heap->no_reclaim); + mi_assert(!heap->allow_page_reclaim); + mi_assert(!heap->allow_page_abandon); mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; #if MI_GUARDED @@ -372,9 +364,9 @@ void mi_heap_destroy(mi_heap_t* heap) { mi_heap_delete(heap); return; #else - if (!heap->no_reclaim) { + if (heap->allow_page_reclaim) { _mi_warning_message("'mi_heap_destroy' called but ignored as the heap was not created with 'allow_destroy' (heap at %p)\n", heap); - // don't free in case it may contain reclaimed pages + // don't free in case it may contain reclaimed pages, mi_heap_delete(heap); } else { @@ -395,7 +387,7 @@ void _mi_heap_unsafe_destroy_all(void) { mi_heap_t* curr = bheap->tld->heaps; while (curr != NULL) { mi_heap_t* next = curr->next; - if (curr->no_reclaim) { + if (!curr->allow_page_reclaim) { mi_heap_destroy(curr); } else { diff --git a/src/init.c b/src/init.c index 4fbd50ed..b66efc69 100644 --- a/src/init.c +++ b/src/init.c @@ -131,12 +131,14 @@ extern mi_heap_t _mi_heap_main; static mi_decl_cache_align mi_subproc_t mi_subproc_default; static mi_decl_cache_align mi_tld_t tld_main = { - 0, false, + 0, &_mi_heap_main, &_mi_heap_main, - &mi_subproc_default, // subproc - 0, // tseq - { 0, &tld_main.stats }, // os - { MI_STATS_NULL } // stats + &mi_subproc_default, // subproc + 0, // tseq + false, // recurse + false, // is_in_threadpool + { 0, &tld_main.stats }, // os + { MI_STATS_NULL } // stats }; mi_decl_cache_align mi_heap_t _mi_heap_main = { @@ -150,8 +152,8 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next heap - false, // can reclaim - true, // eager abandon + true, // allow page reclaim + true, // allow page abandon 0, // tag #if MI_GUARDED 0, 0, 0, 0, 0, @@ -402,6 +404,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->subproc = &mi_subproc_default; tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; + tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); } // Free the thread local default heap (called from `mi_thread_done`) diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index 82147de7..d3dcca93 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -239,6 +239,9 @@ void _mi_prim_thread_done_auto_done(void) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); - } #endif + +bool _mi_prim_thread_is_in_threadpool(void) { + return false; +} diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 5a4440c3..e1ca3964 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -886,3 +886,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif + +bool _mi_prim_thread_is_in_threadpool(void) { + return false; +} diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c index e1e7de5e..def09985 100644 --- a/src/prim/wasi/prim.c +++ b/src/prim/wasi/prim.c @@ -277,3 +277,7 @@ void _mi_prim_thread_done_auto_done(void) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } + +bool _mi_prim_thread_is_in_threadpool(void) { + return 
false; +} diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 276da85c..80522f47 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -17,6 +17,9 @@ terms of the MIT license. A copy of the license can be found in the file // Dynamically bind Windows API points for portability //--------------------------------------------- +static DWORD win_major_version = 6; +static DWORD win_minor_version = 0; + // We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. // So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) // NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) @@ -115,6 +118,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = true; + // windows version + const DWORD win_version = GetVersion(); + win_major_version = (DWORD)(LOBYTE(LOWORD(win_version))); + win_minor_version = (DWORD)(HIBYTE(LOWORD(win_version))); // get the page size SYSTEM_INFO si; GetSystemInfo(&si); @@ -134,7 +141,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) { config->physical_memory = (size_t)(memInKiB * MI_KiB); } - } + } // get the VirtualAlloc2 function HINSTANCE hDll; hDll = LoadLibrary(TEXT("kernelbase.dll")); @@ -809,4 +816,18 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { void _mi_allocator_done(void) { mi_allocator_done(); } -#endif \ No newline at end of file +#endif + + +bool _mi_prim_thread_is_in_threadpool(void) { + #if (MI_ARCH_X64 || MI_ARCH_X86) + if (win_major_version >= 6) { + // check if this thread belongs to a windows threadpool + // see: + _TEB* const teb = NtCurrentTeb(); + void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778))); + return (pool_data != NULL); + } + #endif + return false; +} diff --git a/test/test-stress.c b/test/test-stress.c index 19edf2b5..915c953f 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -347,8 +347,8 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,true,false); #endif - mi_collect(true); - mi_debug_show_arenas(true, true, false); + // mi_collect(true); + // mi_debug_show_arenas(true, true, false); // mi_stats_print(NULL); #else mi_stats_print(NULL); // so we see rss/commit/elapsed From 5a06d2aeba381d47371fcb3189cf24b9ceda2865 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 09:03:25 -0800 Subject: [PATCH 046/264] update bit primitives --- include/mimalloc/bits.h | 200 +++++++++++----------------------------- src/libc.c | 75 +++++++++++++-- src/os.c | 2 +- 3 files changed, 122 insertions(+), 155 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index e1951cf7..3afac04d 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -36,6 +36,12 @@ terms of the MIT license. 
A copy of the license can be found in the file #error platform pointers must be 32, 64, or 128 bits #endif +#if (INTPTR_MAX) > LONG_MAX +# define MI_PU(x) x##ULL +#else +# define MI_PU(x) x##UL +#endif + #if SIZE_MAX == UINT64_MAX # define MI_SIZE_SHIFT (3) typedef int64_t mi_ssize_t; @@ -43,15 +49,13 @@ typedef int64_t mi_ssize_t; # define MI_SIZE_SHIFT (2) typedef int32_t mi_ssize_t; #else -#error platform objects must be 32 or 64 bits +#error platform objects must be 32 or 64 bits in size #endif #if (SIZE_MAX/2) > LONG_MAX # define MI_ZU(x) x##ULL -# define MI_ZI(x) x##LL #else # define MI_ZU(x) x##UL -# define MI_ZI(x) x##L #endif #define MI_INTPTR_SIZE (1< @@ -352,30 +272,15 @@ static inline size_t mi_rotr(size_t x, size_t r) { #endif } -static inline uint32_t mi_rotr32(uint32_t x, uint32_t r) { - #if mi_has_builtin(rotateright32) - return mi_builtin(rotateright32)(x, r); - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - return _lrotr(x, (int)r); - #else - // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to - // avoid UB when `rshift==0`. See - const unsigned int rshift = (unsigned int)(r) & 31; - return ((x >> rshift) | (x << ((-rshift) & 31))); - #endif -} - static inline size_t mi_rotl(size_t x, size_t r) { #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64) return mi_builtin(rotateleft64)(x,r); #elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32) return mi_builtin(rotateleft32)(x,r); - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - #if MI_SIZE_BITS==32 - return _lrotl(x,(int)r); - #else - return _rotl64(x,(int)r); - #endif + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64) + return _rotl64(x, (int)r); + #elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32) + return _lrotl(x, (int)r); #else // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to // avoid UB when `rshift==0`. See @@ -385,5 +290,4 @@ static inline size_t mi_rotl(size_t x, size_t r) { } - #endif // MI_BITS_H diff --git a/src/libc.c b/src/libc.c index 20e9e38b..3fdbf3e7 100644 --- a/src/libc.c +++ b/src/libc.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -277,10 +277,12 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) 
{ // -------------------------------------------------------- -// generic trailing and leading zero count +// generic trailing and leading zero count, and popcount // -------------------------------------------------------- -uint32_t _mi_ctz_generic32(uint32_t x) { +#if !MI_HAS_FAST_BITSCAN + +static size_t mi_ctz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, @@ -319,10 +321,71 @@ size_t _mi_clz_generic(size_t x) { size_t _mi_ctz_generic(size_t x) { if (x==0) return MI_SIZE_BITS; #if (MI_SIZE_BITS <= 32) - return _mi_ctz_generic32((uint32_t)x); + return mi_ctz_generic32((uint32_t)x); #else - const size_t count = _mi_ctz_generic32((uint32_t)x); + const size_t count = mi_ctz_generic32((uint32_t)x); if (count < 32) return count; - return (32 + _mi_ctz_generic32((uint32_t)(x>>32))); + return (32 + mi_ctz_generic32((uint32_t)(x>>32))); #endif } + +#endif // bit scan + +#if !MI_HAS_FAST_POPCOUNT + +#if MI_SIZE_SIZE == 4 +#define mi_mask_even_bits32 (0x55555555) +#define mi_mask_even_pairs32 (0x33333333) +#define mi_mask_even_nibbles32 (0x0F0F0F0F) + +// sum of all the bytes in `x` if it is guaranteed that the sum < 256! +static size_t mi_byte_sum32(uint32_t x) { + // perform `x * 0x01010101`: the highest byte contains the sum of all bytes. + x += (x << 8); + x += (x << 16); + return (size_t)(x >> 24); +} + +static size_t mi_popcount_generic32(uint32_t x) { + // first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10 + // in other words, `a - (a>>1)`; to do this in parallel, we need to mask to prevent spilling a bit pair + // into the lower bit-pair: + x = x - ((x >> 1) & mi_mask_even_bits32); + // add the 2-bit pair results + x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32); + // add the 4-bit nibble results + x = (x + (x >> 4)) & mi_mask_even_nibbles32; + // each byte now has a count of its bits, we can sum them now: + return mi_byte_sum32(x); +} + +size_t _mi_popcount_generic(size_t x) { + return mi_popcount_generic32(x); +} + +#else +#define mi_mask_even_bits64 (0x5555555555555555) +#define mi_mask_even_pairs64 (0x3333333333333333) +#define mi_mask_even_nibbles64 (0x0F0F0F0F0F0F0F0F) + +// sum of all the bytes in `x` if it is guaranteed that the sum < 256! 
+static size_t mi_byte_sum64(uint64_t x) { + x += (x << 8); + x += (x << 16); + x += (x << 32); + return (size_t)(x >> 56); +} + +static size_t mi_popcount_generic64(uint64_t x) { + x = x - ((x >> 1) & mi_mask_even_bits64); + x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64); + x = (x + (x >> 4)) & mi_mask_even_nibbles64; + return mi_byte_sum64(x); +} + +size_t _mi_popcount_generic(size_t x) { + return mi_popcount_generic64(x); +} +#endif + +#endif // popcount diff --git a/src/os.c b/src/os.c index b05068fd..0c020302 100644 --- a/src/os.c +++ b/src/os.c @@ -175,7 +175,7 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm MI_UNUSED(tld_stats); mi_stats_t* stats = &_mi_stats_main; - mi_stat_counter_increase(stats->mmap_calls, 1); + _mi_stat_counter_increase(&stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); if (commit) { From 2ed6e03d276dc90072236a644e22aea87b108180 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 09:14:16 -0800 Subject: [PATCH 047/264] update optimization on haswell --- CMakeLists.txt | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58895d56..52bb60b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhea option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF) option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) -option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for arm64: '-march=armv8.1-a' (2016))" ON) +option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON) option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) @@ -388,21 +388,28 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM list(APPEND mi_cflags -ftls-model=initial-exec) endif() endif() +endif() + +if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel") if(MI_OVERRIDE) list(APPEND mi_cflags -fno-builtin-malloc) endif() if(MI_OPT_ARCH) - if(MI_ARCH STREQUAL "arm64") - set(MI_ARCH_OPT_FLAGS "-march=armv8.1-a") # fast atomics, since ~ 2016 + if(MI_ARCH STREQUAL "x64") + set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2") # fast bit scan (since 2013) + elseif(MI_ARCH STREQUAL "arm64") + set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics (since 2016) endif() endif() endif() -if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) +if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) # vs2017+ list(APPEND mi_cflags /Zc:__cplusplus) if(MI_OPT_ARCH) - if(MI_ARCH STREQUAL "arm64") - set(MI_OPT_ARCH_FLAGS "/arch:armv8.1") # fast atomics, since ~ 2016 + if(MI_ARCH STREQUAL "x64") + set(MI_OPT_ARCH_FLAGS "/arch:AVX2") # fast bit scan (since 2013) + elseif(MI_ARCH STREQUAL "arm64") + set(MI_OPT_ARCH_FLAGS "/arch:armv8.1") # fast atomics (since 2016) endif() endif() endif() @@ -411,9 +418,9 @@ if(MINGW) add_definitions(-D_WIN32_WINNT=0x600) endif() -if(MI_ARCH_OPT_FLAGS) - list(APPEND mi_cflags ${MI_ARCH_OPT_FLAGS}) - message(STATUS "Architecture specific 
optimization is enabled (with ${MI_ARCH_OPT_FLAGS}) (MI_OPT_ARCH=ON)") +if(MI_OPT_ARCH_FLAGS) + list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS}) + message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)") endif() # extra needed libraries From 67cc424ada05652c22417edef72bfe1a227ec309 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 09:19:05 -0800 Subject: [PATCH 048/264] delete old files --- src/arena-abandon.c | 357 ----------- src/arena-old.c | 988 ------------------------------ src/arena-page.c | 20 - src/bitmap-old.c | 419 ------------- src/bitmap-old.h | 110 ---- src/page.c | 53 -- src/segment-map.c | 126 ---- src/segment.c | 1387 ------------------------------------------- 8 files changed, 3460 deletions(-) delete mode 100644 src/arena-abandon.c delete mode 100644 src/arena-old.c delete mode 100644 src/arena-page.c delete mode 100644 src/bitmap-old.c delete mode 100644 src/bitmap-old.h delete mode 100644 src/segment-map.c delete mode 100644 src/segment.c diff --git a/src/arena-abandon.c b/src/arena-abandon.c deleted file mode 100644 index 14712886..00000000 --- a/src/arena-abandon.c +++ /dev/null @@ -1,357 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -#if !defined(MI_IN_ARENA_C) -#error "this file should be included from 'arena.c' (so mi_arena_t is visible)" -// add includes help an IDE -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "bitmap.h" -#endif - -// Minimal exports for arena-abandoned. -size_t mi_arena_id_index(mi_arena_id_t id); -mi_arena_t* mi_arena_from_index(size_t idx); -size_t mi_arena_get_count(void); -void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex); -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index); - -/* ----------------------------------------------------------- - Abandoned blocks/segments: - - _mi_arena_segment_clear_abandoned - _mi_arena_segment_mark_abandoned - - This is used to atomically abandon/reclaim segments - (and crosses the arena API but it is convenient to have here). - - Abandoned segments still have live blocks; they get reclaimed - when a thread frees a block in it, or when a thread needs a fresh - segment. - - Abandoned segments are atomically marked in the `block_abandoned` - bitmap of arenas. Any segments allocated outside arenas are put - in the sub-process `abandoned_os_list`. This list is accessed - using locks but this should be uncommon and generally uncontended. - Reclaim and visiting either scan through the `block_abandoned` - bitmaps of the arena's, or visit the `abandoned_os_list` - - A potentially nicer design is to use arena's for everything - and perhaps have virtual arena's to map OS allocated memory - but this would lack the "density" of our current arena's. TBC. ------------------------------------------------------------ */ - - -// reclaim a specific OS abandoned segment; `true` on success. -// sets the thread_id. 
-static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena, remove from list of abandoned os segments - mi_subproc_t* const subproc = segment->subproc; - if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) { - return false; // failed to acquire the lock, we just give up - } - // remove atomically from the abandoned os list (if possible!) - bool reclaimed = false; - mi_segment_t* const next = segment->abandoned_os_next; - mi_segment_t* const prev = segment->abandoned_os_prev; - if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) { - #if MI_DEBUG>3 - // find ourselves in the abandoned list (and check the count) - bool found = false; - size_t count = 0; - for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) { - if (current == segment) { found = true; } - count++; - } - mi_assert_internal(found); - mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count)); - #endif - // remove (atomically) from the list and reclaim - if (prev != NULL) { prev->abandoned_os_next = next; } - else { subproc->abandoned_os_list = next; } - if (next != NULL) { next->abandoned_os_prev = prev; } - else { subproc->abandoned_os_list_tail = prev; } - segment->abandoned_os_next = NULL; - segment->abandoned_os_prev = NULL; - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count); - if (take_lock) { // don't reset the thread_id when iterating - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - reclaimed = true; - } - if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); } - return reclaimed; -} - -// reclaim a specific abandoned segment; `true` on success. -// sets the thread_id. -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) { - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */); - } - // arena segment: use the blocks_abandoned bitmap. - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // reclaim atomically - bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - if (was_marked) { - mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0); - mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - // mi_assert_internal(was_marked); - mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return was_marked; -} - - -// mark a specific OS segment as abandoned -static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena; we use a list of abandoned segments - mi_subproc_t* const subproc = segment->subproc; - if (!mi_lock_acquire(&subproc->abandoned_os_lock)) { - _mi_error_message(EFAULT, "internal error: failed to acquire the abandoned (os) segment lock to mark abandonment"); - // we can continue but cannot visit/reclaim such blocks.. 
- } - else { - // push on the tail of the list (important for the visitor) - mi_segment_t* prev = subproc->abandoned_os_list_tail; - mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL); - mi_assert_internal(segment->abandoned_os_prev == NULL); - mi_assert_internal(segment->abandoned_os_next == NULL); - if (prev != NULL) { prev->abandoned_os_next = segment; } - else { subproc->abandoned_os_list = segment; } - subproc->abandoned_os_list_tail = segment; - segment->abandoned_os_prev = prev; - segment->abandoned_os_next = NULL; - mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count); - mi_atomic_increment_relaxed(&subproc->abandoned_count); - // and release the lock - mi_lock_release(&subproc->abandoned_os_lock); - } - return; -} - -// mark a specific segment as abandoned -// clears the thread_id. -void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) -{ - mi_assert_internal(segment->used == segment->abandoned); - mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - mi_arena_segment_os_mark_abandoned(segment); - return; - } - // segment is in an arena, mark it in the arena `blocks_abandoned` bitmap - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // set abandonment atomically - mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned - const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); } - mi_assert_internal(was_unmarked); - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); -} - - -/* ----------------------------------------------------------- - Iterate through the abandoned blocks/segments using a cursor. - This is used for reclaiming and abandoned block visiting. ------------------------------------------------------------ */ - -// start a cursor at a randomized arena -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) { - mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); - current->bitmap_idx = 0; - current->subproc = subproc; - current->visit_all = visit_all; - current->hold_visit_lock = false; - const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count); - const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count); - const size_t max_arena = mi_arena_get_count(); - if (heap != NULL && heap->arena_id != _mi_arena_id_none()) { - // for a heap that is bound to one arena, only visit that arena - current->start = mi_arena_id_index(heap->arena_id); - current->end = current->start + 1; - current->os_list_count = 0; - } - else { - // otherwise visit all starting at a random location - if (abandoned_count > abandoned_list_count && max_arena > 0) { - current->start = (heap == NULL || max_arena == 0 ? 
0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); - current->end = current->start + max_arena; - } - else { - current->start = 0; - current->end = 0; - } - current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list - } - mi_assert_internal(current->start <= max_arena); -} - -void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) { - if (current->hold_visit_lock) { - mi_lock_release(¤t->subproc->abandoned_os_visit_lock); - current->hold_visit_lock = false; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { - // try to reclaim an abandoned segment in the arena atomically - if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL; - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - // check that the segment belongs to our sub-process - // note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled. - // without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process. - // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock. - if (segment->subproc != subproc) { - // it is from another sub-process, re-mark it and continue searching - const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - mi_assert_internal(was_zero); MI_UNUSED(was_zero); - return NULL; - } - else { - // success, we unabandoned a segment in our sub-process - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - return segment; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) { - const size_t max_arena = mi_arena_get_count(); - size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); - size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx); - // visit arena's (from the previous cursor) - for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) { - // index wraps around - size_t arena_idx = (previous->start >= max_arena ? previous->start % max_arena : previous->start); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - if (arena != NULL) { - bool has_lock = false; - // visit the abandoned fields (starting at previous_idx) - for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) { - size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); - if mi_unlikely(field != 0) { // skip zero fields quickly - // we only take the arena lock if there are actually abandoned segments present - if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { - has_lock = (previous->visit_all ? mi_lock_acquire(&arena->abandoned_visit_lock) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); - if (!has_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); - } - // skip to next arena - break; - } - } - mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned)); - // visit each set bit in the field (todo: maybe use `ctz` here?) 
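
/*
  The "todo: maybe use `ctz` here?" above refers to a standard trick: instead of
  testing every bit position in the field, repeatedly take the index of the
  lowest set bit with a count-trailing-zeros intrinsic and then clear that bit
  with `x &= x - 1`. A standalone sketch, assuming a GCC/Clang-style
  __builtin_ctzll; the `demo_` name is illustrative.
*/
#include <stdint.h>
#include <stdio.h>

static void demo_visit_set_bits(uint64_t field) {
  while (field != 0) {
    const unsigned bit = (unsigned)__builtin_ctzll(field); // index of the lowest set bit
    printf("visit bit %u\n", bit);                         // e.g. try to reclaim the segment at this bit
    field &= (field - 1);                                  // clear the lowest set bit
  }
}

int main(void) {
  demo_visit_set_bits(0x0000000000012005ULL);  // visits bits 0, 2, 13, 16
  return 0;
}
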
- for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { - // pre-check if the bit is set - size_t mask = ((size_t)1 << bit_idx); - if mi_unlikely((field & mask) == mask) { - mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); - if (segment != NULL) { - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration - return segment; - } - } - } - } - } - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - } - } - return NULL; -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) { - // go through the abandoned_os_list - // we only allow one thread per sub-process to do to visit guarded by the `abandoned_os_visit_lock`. - // The lock is released when the cursor is released. - if (!previous->hold_visit_lock) { - previous->hold_visit_lock = (previous->visit_all ? mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock) - : mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock)); - if (!previous->hold_visit_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock"); - } - return NULL; // we cannot get the lock, give up - } - } - // One list entry at a time - while (previous->os_list_count > 0) { - previous->os_list_count--; - const bool has_lock = mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free` - if (has_lock) { - mi_segment_t* segment = previous->subproc->abandoned_os_list; - // pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries) - if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) { - mi_lock_release(&previous->subproc->abandoned_os_lock); - return segment; - } - // already abandoned, try again - mi_lock_release(&previous->subproc->abandoned_os_lock); - } - else { - _mi_error_message(EFAULT, "failed to acquire abandoned OS list lock during abandoned block visit\n"); - return NULL; - } - } - // done - mi_assert_internal(previous->os_list_count == 0); - return NULL; -} - - -// reclaim abandoned segments -// this does not set the thread id (so it appears as still abandoned) -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) { - if (previous->start < previous->end) { - // walk the arena - mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous); - if (segment != NULL) { return segment; } - } - // no entries in the arena's anymore, walk the abandoned OS list - mi_assert_internal(previous->start == previous->end); - return mi_arena_segment_clear_abandoned_next_list(previous); -} - - -bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - // (unfortunately) the visit_abandoned option must be enabled from the start. 
- // This is to avoid taking locks if abandoned list visiting is not required (as for most programs) - if (!mi_option_is_enabled(mi_option_visit_abandoned)) { - _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); - return false; - } - mi_arena_field_cursor_t current;0 - _mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, ¤t); - mi_segment_t* segment; - bool ok = true; - while (ok && (segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { - ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg); - _mi_arena_segment_mark_abandoned(segment); - } - _mi_arena_field_cursor_done(¤t); - return ok; -} diff --git a/src/arena-old.c b/src/arena-old.c deleted file mode 100644 index 3f41e9c7..00000000 --- a/src/arena-old.c +++ /dev/null @@ -1,988 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -"Arenas" are fixed area's of OS memory from which we can allocate -large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB). -In contrast to the rest of mimalloc, the arenas are shared between -threads and need to be accessed using atomic operations. - -Arenas are also used to for huge OS page (1GiB) reservations or for reserving -OS memory upfront which can be improve performance or is sometimes needed -on embedded devices. We can also employ this with WASI or `sbrk` systems -to reserve large arenas upfront and be able to reuse the memory more effectively. - -The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. ------------------------------------------------------------------------------*/ - -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/atomic.h" -#include "bitmap.h" - - -/* ----------------------------------------------------------- - Arena allocation ------------------------------------------------------------ */ - -// A memory arena descriptor -typedef struct mi_arena_s { - mi_arena_id_t id; // arena id; 0 for non-specific - mi_memid_t memid; // memid of the memory area - _Atomic(uint8_t*)start; // the start of the memory area - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_SLICE_SIZE`) - size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) - size_t meta_size; // size of the arena structure itself (including its bitmaps) - mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) - int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena - bool is_large; // memory area consists of large- or huge OS pages (always committed) - mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited - _Atomic(size_t)search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t)purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? 
- mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) - mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) - mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) - // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. -} mi_arena_t; - - -#define MI_ARENA_SLICE_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_SLICE_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) - -// The available arenas -static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; -static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - -#define MI_IN_ARENA_C -#include "arena-abandon.c" -#undef MI_IN_ARENA_C - -/* ----------------------------------------------------------- - Arena id's - id = arena_index + 1 ------------------------------------------------------------ */ - -size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); -} - -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; -} - -mi_arena_id_t _mi_arena_id_none(void) { - return 0; -} - -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); -} - -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { - if (memid.memkind == MI_MEM_ARENA) { - return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); - } - else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); - } -} - -size_t mi_arena_get_count(void) { - return mi_atomic_load_relaxed(&mi_arena_count); -} - -mi_arena_t* mi_arena_from_index(size_t idx) { - mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); -} - - -/* ----------------------------------------------------------- - Arena allocations get a (currently) 16-bit memory id where the - lower 8 bits are the arena id, and the upper bits the block index. 
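
/*
  A standalone illustration of the encoding described in the comment above: a
  single word holds the arena id in the low 8 bits and the block index in the
  remaining upper bits. The field widths and `demo_` names are illustrative
  only, not the exact mimalloc memid layout.
*/
#include <assert.h>
#include <stddef.h>

#define DEMO_ARENA_ID_BITS  (8)
#define DEMO_ARENA_ID_MASK  (((size_t)1 << DEMO_ARENA_ID_BITS) - 1)

static size_t demo_memid_encode(size_t arena_id, size_t block_index) {
  assert(arena_id <= DEMO_ARENA_ID_MASK);
  return (block_index << DEMO_ARENA_ID_BITS) | arena_id;
}

static void demo_memid_decode(size_t memid, size_t* arena_id, size_t* block_index) {
  *arena_id    = (memid & DEMO_ARENA_ID_MASK);
  *block_index = (memid >> DEMO_ARENA_ID_BITS);
}

int main(void) {
  size_t id, idx;
  demo_memid_decode(demo_memid_encode(5, 1234), &id, &idx);
  assert(id == 5 && idx == 1234);  // round-trips
  return 0;
}
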
------------------------------------------------------------ */ - -static size_t mi_block_count_of_size(size_t size) { - return _mi_divide_up(size, MI_ARENA_SLICE_SIZE); -} - -static size_t mi_arena_block_size(size_t bcount) { - return (bcount * MI_ARENA_SLICE_SIZE); -} - -static size_t mi_arena_size(mi_arena_t* arena) { - return mi_arena_block_size(arena->block_count); -} - -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { - mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); - memid.mem.arena.id = id; - memid.mem.arena.block_index = bitmap_index; - memid.mem.arena.is_exclusive = is_exclusive; - return memid; -} - -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { - mi_assert_internal(memid.memkind == MI_MEM_ARENA); - *arena_index = mi_arena_id_index(memid.mem.arena.id); - *bitmap_index = memid.mem.arena.block_index; - return memid.mem.arena.is_exclusive; -} - - - -/* ----------------------------------------------------------- - Special static area for mimalloc internal structures - to avoid OS calls (for example, for the arena metadata (~= 256b)) ------------------------------------------------------------ */ - -#define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit - -static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 -static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; - -static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { - *memid = _mi_memid_none(); - if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; - const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); - if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; - - // try to claim space - if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } - const size_t oversize = size + alignment - 1; - if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; - const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); - size_t top = oldtop + oversize; - if (top > MI_ARENA_STATIC_MAX) { - // try to roll back, ok if this fails - mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); - return NULL; - } - - // success - *memid = _mi_memid_create(MI_MEM_STATIC); - memid->initially_zero = true; - const size_t start = _mi_align_up(oldtop, alignment); - uint8_t* const p = &mi_arena_static[start]; - _mi_memzero_aligned(p, size); - return p; -} - -void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { - *memid = _mi_memid_none(); - - // try static - void* p = mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); - if (p != NULL) return p; - - // or fall back to the OS - p = _mi_os_alloc(size, memid, &_mi_stats_main); - if (p == NULL) return NULL; - - // zero the OS memory if needed - if (!memid->initially_zero) { - _mi_memzero_aligned(p, size); - memid->initially_zero = true; - } - return p; -} - -void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { - if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid, &_mi_stats_main); - } - else { - mi_assert(memid.memkind == MI_MEM_STATIC); - } -} - -void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { - return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); -} - - -/* ----------------------------------------------------------- - Thread safe allocation in an arena ------------------------------------------------------------ */ - -// 
claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) -{ - size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter - if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { - mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around - return true; - }; - return false; -} - - -/* ----------------------------------------------------------- - Arena Allocation ------------------------------------------------------------ */ - -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, mi_memid_t* memid, mi_os_tld_t* tld) -{ - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - - mi_bitmap_index_t bitmap_index; - if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return NULL; - - // claimed it! - void* p = mi_arena_block_start(arena, bitmap_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); - memid->is_pinned = arena->memid.is_pinned; - - // none of the claimed blocks should be scheduled for a decommit - if (arena->blocks_purge != NULL) { - // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`). - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index); - } - - // set the dirty bits (todo: no need for an atomic op here?) - if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { - memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); - } - - // set commit state - if (arena->blocks_committed == NULL) { - // always committed - memid->initially_committed = true; - } - else if (commit) { - // commit requested, but the range may not be committed as a whole: ensure it is committed now - memid->initially_committed = true; - bool any_uncommitted; - _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); - if (any_uncommitted) { - bool commit_zero = false; - if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) { - memid->initially_committed = false; - } - else { - if (commit_zero) { memid->initially_zero = true; } - } - } - } - else { - // no need to commit, but check if already fully committed - memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); - } - - return p; -} - -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED_RELEASE(alignment); - mi_assert(alignment <= MI_SEGMENT_ALIGN); - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_arena_block_size(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_arena_from_index(arena_index); - if (arena == NULL) return NULL; - if 
(!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } - - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} - - -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED(alignment); - mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - } - return NULL; -} - -// try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) -{ - if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); - if (arena_count > (MI_MAX_ARENAS - 4)) return false; - - size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); - if (arena_reserve == 0) return false; - - if (!_mi_os_has_virtual_reserve()) { - arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) - } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); - arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); - if (arena_count >= 8 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); - size_t reserve = 0; - if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { - arena_reserve = reserve; - } - } - if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size - - // commit eagerly? 
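
/*
  A standalone sketch of the reservation scaling used in `mi_arena_reserve`
  above: every 8 arenas the reserve size doubles (with the exponent capped),
  guarded by a plain multiplication-overflow check. The `demo_` names are
  illustrative only.
*/
#include <stddef.h>
#include <stdint.h>

static size_t demo_clamp(size_t v, size_t lo, size_t hi) {
  return (v < lo ? lo : (v > hi ? hi : v));
}

// returns 0 if the scaled size would overflow a size_t
static size_t demo_scaled_reserve(size_t base_reserve, size_t arena_count) {
  size_t reserve = base_reserve;
  if (arena_count >= 8 && arena_count <= 128) {
    const size_t multiplier = (size_t)1 << demo_clamp(arena_count / 8, 0, 16);
    if (base_reserve != 0 && multiplier > SIZE_MAX / base_reserve) return 0;  // would overflow
    reserve = multiplier * base_reserve;
  }
  return reserve;
}
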
- bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } - else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); -} - - -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - *memid = _mi_memid_none(); - - const int numa_node = _mi_os_numa_node(tld); // current numa node - - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? - if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - } - } - - // if we cannot use OS allocation, return NULL - if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { - errno = ENOMEM; - return NULL; - } - - // finally, fall back to the OS - if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); - } - else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); - } -} - -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); -} - - -void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { - if (size != NULL) *size = 0; - size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); - if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_arena_block_size(arena->block_count); } - return arena->start; -} - - -/* ----------------------------------------------------------- - Arena purge ------------------------------------------------------------ */ - -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} - -// reset or decommit in an arena and update the committed/decommit bitmaps -// assumes we own the area (i.e. 
blocks_in_use is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_arena_block_size(blocks); - void* const p = mi_arena_block_start(arena, bitmap_idx); - bool needs_recommit; - if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { - // all blocks are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); - } - else { - // some blocks are not committed -- this can happen when a partially committed block is freed - // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), - // and also undo the decommit stats (as it was already adjusted) - mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } - } - - // clear the purged blocks - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); - // update committed bitmap - if (needs_recommit) { - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); - } -} - -// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. -// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_purge != NULL); - const long delay = mi_arena_purge_delay(); - if (delay < 0) return; // is purging allowed at all? - - if (_mi_preloading() || delay == 0) { - // decommit directly - mi_arena_purge(arena, bitmap_idx, blocks, stats); - } - else { - // schedule decommit - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire != 0) { - mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay - } - else { - mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); - } - _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); - } -} - -// purge a range of blocks -// return true if the full range was purged. -// assumes we own the area (i.e. 
blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) { - const size_t endidx = startidx + bitlen; - size_t bitidx = startidx; - bool all_purged = false; - while (bitidx < endidx) { - // count consecutive ones in the purge mask - size_t count = 0; - while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { - count++; - } - if (count > 0) { - // found range to be purged - const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); - mi_arena_purge(arena, range_idx, count, stats); - if (count == bitlen) { - all_purged = true; - } - } - bitidx += (count+1); // +1 to skip the zero bit (or end) - } - return all_purged; -} - -// returns true if anything was purged -static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) -{ - if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire == 0) return false; - if (!force && expire > now) return false; - - // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); - - // potential purges scheduled, walk through the bitmap - bool any_purged = false; - bool full_purge = true; - for (size_t i = 0; i < arena->field_count; i++) { - size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); - if (purge != 0) { - size_t bitidx = 0; - while (bitidx < MI_BITMAP_FIELD_BITS) { - // find consecutive range of ones in the purge mask - size_t bitlen = 0; - while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { - bitlen++; - } - // temporarily claim the purge range as "in-use" to be thread-safe with allocation - // try to claim the longest range of corresponding in_use bits - const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); - while( bitlen > 0 ) { - if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { - break; - } - bitlen--; - } - // actual claimed bits at `in_use` - if (bitlen > 0) { - // read purge again now that we have the in_use bits - purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) { - full_purge = false; - } - any_purged = true; - // release the claimed `in_use` bits again - _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); - } - bitidx += (bitlen+1); // +1 to skip the zero (or end) - } // while bitidx - } // purge != 0 - } - // if not fully purged, make sure to purge again in the future - if (!full_purge) { - const long delay = mi_arena_purge_delay(); - mi_msecs_t expected = 0; - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); - } - return any_purged; -} - -static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { - if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - - const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); - if (max_arena == 0) return; - - // allow only one thread to purge at a time - static mi_atomic_guard_t purge_guard; - mi_atomic_guard(&purge_guard) - { - mi_msecs_t now = _mi_clock_now(); - size_t max_purge_count = (visit_all ? 
max_arena : 1); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - if (mi_arena_try_purge(arena, now, force, stats)) { - if (max_purge_count <= 1) break; - max_purge_count--; - } - } - } - } -} - - -/* ----------------------------------------------------------- - Arena free ------------------------------------------------------------ */ - -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); - mi_assert_internal(committed_size <= size); - if (p==NULL) return; - if (size==0) return; - const bool all_committed = (committed_size == size); - - // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) - mi_track_mem_undefined(p,size); - - if (mi_memkind_is_os(memid.memkind)) { - // was a direct OS allocation, pass through - if (!all_committed && committed_size > 0) { - // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - _mi_os_free(p, size, memid, stats); - } - else if (memid.memkind == MI_MEM_ARENA) { - // allocated in an arena - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const size_t blocks = mi_block_count_of_size(size); - - // checks - if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); - if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { - _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - - // potentially decommit - if (arena->memid.is_pinned || arena->blocks_committed == NULL) { - mi_assert_internal(all_committed); - } - else { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - - if (!all_committed) { - // mark the entire range as no longer committed (so we recommit the full range when re-using) - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); - mi_track_mem_noaccess(p,size); - if (committed_size > 0) { - // if partially committed, adjust the committed stats (is it will be recommitted when re-using) - // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - // note: if not all committed, it may be that the purge will reset/decommit the entire range - // that contains already decommitted parts. Since purge consistently uses reset or decommit that - // works (as we should never reset decommitted parts). 
- } - // (delay) purge the entire range - mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); - } - - // and make it available to others again - bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); - if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); - return; - }; - } - else { - // arena was none, external, or static; nothing to do - mi_assert_internal(memid.memkind < MI_MEM_OS); - } - - // purge expired decommits - mi_arenas_try_purge(false, false, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. -static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - size_t new_max_arena = 0; - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - mi_lock_done(&arena->abandoned_visit_lock); - if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); - } - else { - new_max_arena = i; - } - _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); - } - } - - // try to lower the max arena. - size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); -} - -// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { - mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas -} - -// Is a pointer inside any of our arenas? -bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { - return true; - } - } - return false; -} - -/* ----------------------------------------------------------- - Add an arena. 
------------------------------------------------------------ */ - -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { - mi_assert_internal(arena != NULL); - mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); - mi_assert_internal(arena->block_count > 0); - if (arena_id != NULL) { *arena_id = -1; } - - size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); - if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); - return false; - } - _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } - return true; -} - -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept -{ - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - if (size < MI_ARENA_SLICE_SIZE) return false; - - if (is_large) { - mi_assert_internal(memid.initially_committed && memid.is_pinned); - } - - const size_t bcount = size / MI_ARENA_SLICE_SIZE; - const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); - const size_t bitmaps = (memid.is_pinned ? 3 : 5); - const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); - mi_memid_t meta_memid; - mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); - if (arena == NULL) return false; - - // already zero'd due to zalloc - // _mi_memzero(arena, asize); - arena->id = _mi_arena_id_none(); - arena->memid = memid; - arena->exclusive = exclusive; - arena->meta_size = asize; - arena->meta_memid = meta_memid; - arena->block_count = bcount; - arena->field_count = fields; - arena->start = (uint8_t*)start; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) - arena->is_large = is_large; - arena->purge_expire = 0; - arena->search_idx = 0; - mi_lock_init(&arena->abandoned_visit_lock); - // consecutive bitmaps - arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap - arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap - arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap - arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap - // initialize committed bitmap? 
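
/*
  A standalone sketch of the "consecutive bitmaps" layout used just above: one
  allocation holds the arena descriptor followed by several bitmaps of `fields`
  words each, and the individual bitmap pointers are simply offsets into that
  single block. The `demo_` names are illustrative only.
*/
#include <stdlib.h>
#include <stddef.h>

typedef size_t demo_bitmap_field_t;

typedef struct demo_arena_s {
  size_t               field_count;   // words per bitmap
  demo_bitmap_field_t* dirty;         // just after the in-use bitmap
  demo_bitmap_field_t* committed;     // just after the dirty bitmap
  demo_bitmap_field_t  inuse[1];      // in-place first bitmap; the others follow it
} demo_arena_t;

static demo_arena_t* demo_arena_create(size_t fields) {
  const size_t bitmaps = 3;  // in-use, dirty, committed
  const size_t asize = sizeof(demo_arena_t) + (bitmaps * fields * sizeof(demo_bitmap_field_t));
  demo_arena_t* arena = (demo_arena_t*)calloc(1, asize);   // zeroed, like the meta zalloc
  if (arena == NULL) return NULL;
  arena->field_count = fields;
  arena->dirty     = &arena->inuse[fields];      // bitmap #2
  arena->committed = &arena->inuse[2 * fields];  // bitmap #3
  return arena;
}
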
- if (arena->blocks_committed != NULL && arena->memid.initially_committed) { - memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning - } - - // and claim leftover blocks if needed (so we never allocate there) - ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; - mi_assert_internal(post >= 0); - if (post > 0) { - // don't use leftover bits at the end - mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); - } - return mi_arena_add(arena, arena_id, &_mi_stats_main); - -} - -bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); - memid.initially_committed = is_committed; - memid.initially_zero = is_zero; - memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one block - mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); - if (start == NULL) return ENOMEM; - const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); - _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); - return ENOMEM; - } - _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); - return 0; -} - - -// Manage a range of regular OS memory -bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { - return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { - return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); -} - - -/* ----------------------------------------------------------- - Debugging ------------------------------------------------------------ */ - -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { - _mi_verbose_message("%s%s:\n", prefix, header); - size_t bcount = 0; - size_t inuse_count = 0; - for (size_t i = 0; i < field_count; i++) { - char buf[MI_BITMAP_FIELD_BITS + 1]; - uintptr_t field = mi_atomic_load_relaxed(&fields[i]); - for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { - if (bcount < block_count) { - bool inuse = ((((uintptr_t)1 << bit) & field) != 0); - if (inuse) inuse_count++; - buf[bit] = (inuse ? 
'x' : '.'); - } - else { - buf[bit] = ' '; - } - } - buf[MI_BITMAP_FIELD_BITS] = 0; - _mi_verbose_message("%s %s\n", prefix, buf); - } - _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); - return inuse_count; -} - -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); - size_t inuse_total = 0; - size_t abandoned_total = 0; - size_t purge_total = 0; - for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena == NULL) break; - _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_SLICE_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); - if (show_inuse) { - inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); - } - if (arena->blocks_committed != NULL) { - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); - } - if (show_abandoned) { - abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); - } - if (show_purge && arena->blocks_purge != NULL) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); - } - } - if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); - if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); -} - - -/* ----------------------------------------------------------- - Reserve a huge page arena. ------------------------------------------------------------ */ -// reserve at a specific numa node -int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; - if (pages==0) return 0; - if (numa_node < -1) numa_node = -1; - if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); - size_t hsize = 0; - size_t pages_reserved = 0; - mi_memid_t memid; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); - if (p==NULL || pages_reserved==0) { - _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); - return ENOMEM; - } - _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); - return ENOMEM; - } - return 0; -} - -int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { - return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); -} - -// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) -int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { - if (pages == 0) return 0; - - // pages per numa node - size_t numa_count = (numa_nodes > 0 ? 
numa_nodes : _mi_os_numa_node_count()); - if (numa_count <= 0) numa_count = 1; - const size_t pages_per = pages / numa_count; - const size_t pages_mod = pages % numa_count; - const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); - - // reserve evenly among numa nodes - for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { - size_t node_pages = pages_per; // can be 0 - if (numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); - if (err) return err; - if (pages < node_pages) { - pages = 0; - } - else { - pages -= node_pages; - } - } - - return 0; -} - -int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { - MI_UNUSED(max_secs); - _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); - if (pages_reserved != NULL) *pages_reserved = 0; - int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); - if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; - return err; -} - - diff --git a/src/arena-page.c b/src/arena-page.c deleted file mode 100644 index 93d25dbf..00000000 --- a/src/arena-page.c +++ /dev/null @@ -1,20 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- - ------------------------------------------------------------------------------*/ - -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "bitmap.h" - - -/* ----------------------------------------------------------- - Arena allocation ------------------------------------------------------------ */ - diff --git a/src/bitmap-old.c b/src/bitmap-old.c deleted file mode 100644 index 3e6311dc..00000000 --- a/src/bitmap-old.c +++ /dev/null @@ -1,419 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. 
(This is used in arena allocation) ----------------------------------------------------------------------------- */ - -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/bits.h" -#include "bitmap.h" - -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ - -// The bit mask for a given number of blocks at a specified bit index. -static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) { - mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); - mi_assert_internal(count > 0); - if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; - if (count == 0) return 0; - return ((((size_t)1 << count) - 1) << bitidx); -} - - - -/* ----------------------------------------------------------- - Claim a bit sequence atomically ------------------------------------------------------------ */ - -// Try to atomically claim a sequence of `count` bits in a single -// field at `idx` in `bitmap`. Returns `true` on success. -bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) -{ - mi_assert_internal(bitmap_idx != NULL); - mi_assert_internal(count <= MI_BITMAP_FIELD_BITS); - mi_bitmap_field_t* field = &bitmap[idx]; - size_t map = mi_atomic_load_relaxed(field); - if (map==MI_BITMAP_FIELD_FULL) return false; // short cut - - // search for 0-bit sequence of length count - const size_t mask = mi_bitmap_mask_(count, 0); - const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; - -#if MI_HAS_FAST_BITSCAN - size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible -#else - size_t bitidx = 0; // otherwise start at 0 -#endif - size_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx - - // scan linearly for a free range of zero bits - while (bitidx <= bitidx_max) { - const size_t mapm = (map & m); - if (mapm == 0) { // are the mask bits free at bitidx? - mi_assert_internal((m >> bitidx) == mask); // no overflow? - const size_t newmap = (map | m); - mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here? - // no success, another thread claimed concurrently.. keep going (with updated `map`) - continue; - } - else { - // success, we claimed the bits! - *bitmap_idx = mi_bitmap_index_create(idx, bitidx); - return true; - } - } - else { - // on to the next bit range -#if MI_HAS_FAST_BITSCAN - mi_assert_internal(mapm != 0); - const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); - mi_assert_internal(shift > 0 && shift <= count); -#else - const size_t shift = 1; -#endif - bitidx += shift; - m <<= shift; - } - } - // no bits found - return false; -} - - -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - return true; - } - } - return false; -} - - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. 
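
/*
  A standalone sketch of the atomic claim step used in the functions above:
  build a mask of `count` bits at `bitidx` and only publish it with a
  compare-exchange when all of those bits are still zero; releasing uses a
  single fetch-and. Assumes `count >= 1` and `bitidx + count` fits in one
  word; the `demo_` names are illustrative only.
*/
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static size_t demo_mask(size_t count, size_t bitidx) {
  if (count >= sizeof(size_t) * 8) return ~(size_t)0;
  return (((size_t)1 << count) - 1) << bitidx;
}

// try to flip `count` bits at `bitidx` from 0 to 1; true on success
static bool demo_try_claim(_Atomic(size_t)* field, size_t bitidx, size_t count) {
  const size_t mask = demo_mask(count, bitidx);
  size_t expected = atomic_load_explicit(field, memory_order_relaxed);
  do {
    if ((expected & mask) != 0) return false;  // some bit already claimed concurrently
  } while (!atomic_compare_exchange_weak_explicit(field, &expected, expected | mask,
                                                  memory_order_acq_rel, memory_order_relaxed));
  return true;
}

// clear the same bits again; true if they were all set before
static bool demo_unclaim(_Atomic(size_t)* field, size_t bitidx, size_t count) {
  const size_t mask = demo_mask(count, bitidx);
  const size_t prev = atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel);
  return ((prev & mask) == mask);
}
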
-bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - // mi_assert_internal((bitmap[idx] & mask) == mask); - const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); - return ((prev & mask) == mask); -} - - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); - size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); - if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); } - return ((prev & mask) == 0); -} - -// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one. -static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - const size_t field = mi_atomic_load_relaxed(&bitmap[idx]); - if (any_ones != NULL) { *any_ones = ((field & mask) != 0); } - return ((field & mask) == mask); -} - -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. -bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); - do { - if ((expected & mask) != 0) return false; - } - while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); - mi_assert_internal((expected & mask) == 0); - return true; -} - - -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL); -} - -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; -} - - -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. 
This is used in arena allocation -//-------------------------------------------------------------------------- - -// Try to atomically claim a sequence of `count` bits starting from the field -// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. -// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) -static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) -{ - mi_assert_internal(bitmap_idx != NULL); - - // check initial trailing zeros - mi_bitmap_field_t* field = &bitmap[idx]; - size_t map = mi_atomic_load_relaxed(field); - const size_t initial = mi_clz(map); // count of initial zeros starting at idx - mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS); - if (initial == 0) return false; - if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) - if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries - - // scan ahead - size_t found = initial; - size_t mask = 0; // mask bits for the final field - while(found < count) { - field++; - map = mi_atomic_load_relaxed(field); - const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found)); - mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS); - mask = mi_bitmap_mask_(mask_bits, 0); - if ((map & mask) != 0) return false; // some part is already claimed - found += mask_bits; - } - mi_assert_internal(field < &bitmap[bitmap_fields]); - - // we found a range of contiguous zeros up to the final field; mask contains mask in the final field - // now try to claim the range atomically - mi_bitmap_field_t* const final_field = field; - const size_t final_mask = mask; - mi_bitmap_field_t* const initial_field = &bitmap[idx]; - const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial; - const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx); - - // initial field - size_t newmap; - field = initial_field; - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | initial_mask); - if ((map & initial_mask) != 0) { goto rollback; }; - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - - // intermediate fields - while (++field < final_field) { - newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); - map = 0; - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } - } - - // final field - mi_assert_internal(field == final_field); - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | final_mask); - if ((map & final_mask) != 0) { goto rollback; } - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - - // claimed! 
- mi_stat_counter_increase(stats->arena_crossover_count,1); - *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); - return true; - -rollback: - // roll back intermediate fields - // (we just failed to claim `field` so decrement first) - while (--field > initial_field) { - newmap = 0; - map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); - mi_assert_internal(mi_atomic_load_relaxed(field) == map); - mi_atomic_store_release(field, newmap); - } - if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`) - map = mi_atomic_load_relaxed(field); - do { - mi_assert_internal((map & initial_mask) == initial_mask); - newmap = (map & ~initial_mask); - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - } - mi_stat_counter_increase(stats->arena_rollback_count,1); - // retry? (we make a recursive call instead of goto to be able to use const declarations) - if (retries <= 2) { - return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); - } - else { - return false; - } -} - - -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { - mi_assert_internal(count > 0); - if (count <= 2) { - // we don't bother with crossover fields for small counts - return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx); - } - - // visit the fields - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - // first try to claim inside a field - /* - if (count <= MI_BITMAP_FIELD_BITS) { - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - return true; - } - } - */ - // if that fails, then try to claim across fields - if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { - return true; - } - } - return false; -} - -// Helper for masks across fields; returns the mid count, post_mask may be 0 -static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) { - MI_UNUSED(bitmap_fields); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) { - *pre_mask = mi_bitmap_mask_(count, bitidx); - *mid_mask = 0; - *post_mask = 0; - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields); - return 0; - } - else { - const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx; - mi_assert_internal(pre_bits < count); - *pre_mask = mi_bitmap_mask_(pre_bits, bitidx); - count -= pre_bits; - const size_t mid_count = (count / MI_BITMAP_FIELD_BITS); - *mid_mask = MI_BITMAP_FIELD_FULL; - count %= MI_BITMAP_FIELD_BITS; - *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0)); - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields); - return mid_count; - } -} - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. 
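/* Illustration only, not part of this patch: how mi_bitmap_mask_across above splits
   a bit range that crosses 64-bit fields into a head mask in the first field, a
   count of fully covered middle fields, and a tail mask in the last field. The
   64-bit field width and the demo_* names are assumptions of the sketch. */
#include <stddef.h>
#include <stdint.h>

#define DEMO_FIELD_BITS 64

static inline uint64_t demo_span_mask(size_t count, size_t bitidx) {
  return (count >= DEMO_FIELD_BITS ? ~UINT64_C(0)
                                   : ((UINT64_C(1) << count) - 1) << bitidx);
}

typedef struct demo_span_s {
  uint64_t pre_mask;    // bits in the first field, starting at `bitidx`
  size_t   mid_count;   // number of middle fields that are covered completely
  uint64_t post_mask;   // bits in the final field, starting at bit 0 (0 if none)
} demo_span_t;

static demo_span_t demo_split(size_t bitidx, size_t count) {
  demo_span_t s;
  if (bitidx + count <= DEMO_FIELD_BITS) {             // the range fits in one field
    s.pre_mask  = demo_span_mask(count, bitidx);
    s.mid_count = 0;
    s.post_mask = 0;
  }
  else {
    const size_t pre_bits = DEMO_FIELD_BITS - bitidx;  // bits up to the end of field 0
    s.pre_mask  = demo_span_mask(pre_bits, bitidx);
    count      -= pre_bits;
    s.mid_count = count / DEMO_FIELD_BITS;             // full middle fields
    count      %= DEMO_FIELD_BITS;
    s.post_mask = (count == 0 ? 0 : demo_span_mask(count, 0));
  }
  return s;
}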
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_one = true; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part - if ((prev & pre_mask) != pre_mask) all_one = false; - while(mid_count-- > 0) { - prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part - if ((prev & mid_mask) != mid_mask) all_one = false; - } - if (post_mask!=0) { - prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part - if ((prev & post_mask) != post_mask) all_one = false; - } - return all_one; -} - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_zero = true; - bool any_zero = false; - _Atomic(size_t)*field = &bitmap[idx]; - size_t prev = mi_atomic_or_acq_rel(field++, pre_mask); - if ((prev & pre_mask) != 0) all_zero = false; - if ((prev & pre_mask) != pre_mask) any_zero = true; - while (mid_count-- > 0) { - prev = mi_atomic_or_acq_rel(field++, mid_mask); - if ((prev & mid_mask) != 0) all_zero = false; - if ((prev & mid_mask) != mid_mask) any_zero = true; - } - if (post_mask!=0) { - prev = mi_atomic_or_acq_rel(field, post_mask); - if ((prev & post_mask) != 0) all_zero = false; - if ((prev & post_mask) != post_mask) any_zero = true; - } - if (pany_zero != NULL) { *pany_zero = any_zero; } - return all_zero; -} - - -// Returns `true` if all `count` bits were 1. -// `any_ones` is `true` if there was at least one bit set to one. 
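/* Illustration only, not part of this patch: the claim functions above learn the
   previous state of a whole bit range from the value returned by a single atomic
   OR -- it tells both whether every bit was free and whether any bit was free.
   A 64-bit field and the demo_* names are assumptions of the sketch. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct demo_claim_result_s {
  bool all_were_zero;   // every bit in `mask` was 0 before the OR (range freshly claimed)
  bool any_was_zero;    // at least one bit in `mask` was 0 before the OR
} demo_claim_result_t;

static demo_claim_result_t demo_claim_mask(_Atomic(uint64_t)* field, uint64_t mask) {
  const uint64_t prev = atomic_fetch_or_explicit(field, mask, memory_order_acq_rel);
  demo_claim_result_t r;
  r.all_were_zero = ((prev & mask) == 0);
  r.any_was_zero  = ((prev & mask) != mask);
  return r;
}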
-static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_ones = true; - bool any_ones = false; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_load_relaxed(field++); - if ((prev & pre_mask) != pre_mask) all_ones = false; - if ((prev & pre_mask) != 0) any_ones = true; - while (mid_count-- > 0) { - prev = mi_atomic_load_relaxed(field++); - if ((prev & mid_mask) != mid_mask) all_ones = false; - if ((prev & mid_mask) != 0) any_ones = true; - } - if (post_mask!=0) { - prev = mi_atomic_load_relaxed(field); - if ((prev & post_mask) != post_mask) all_ones = false; - if ((prev & post_mask) != 0) any_ones = true; - } - if (pany_ones != NULL) { *pany_ones = any_ones; } - return all_ones; -} - -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL); -} - -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; -} diff --git a/src/bitmap-old.h b/src/bitmap-old.h deleted file mode 100644 index f8898935..00000000 --- a/src/bitmap-old.h +++ /dev/null @@ -1,110 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). -(this is used in region allocation) - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. (This is used in arena allocation) ----------------------------------------------------------------------------- */ -#pragma once -#ifndef MI_BITMAP_H -#define MI_BITMAP_H - -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ - -#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE) -#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set - -// An atomic bitmap of `size_t` fields -typedef _Atomic(size_t) mi_bitmap_field_t; -typedef mi_bitmap_field_t* mi_bitmap_t; - -// A bitmap index is the index of the bit in a bitmap. -typedef size_t mi_bitmap_index_t; - -// Create a bit index. 
-static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) { - mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS); - return (idx*MI_BITMAP_FIELD_BITS) + bitidx; -} -static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { - mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); - return mi_bitmap_index_create_ex(idx,bitidx); -} - -// Get the field index from a bit index. -static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx / MI_BITMAP_FIELD_BITS); -} - -// Get the bit index in a bitmap field -static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx % MI_BITMAP_FIELD_BITS); -} - -// Get the full bit index -static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { - return bitmap_idx; -} - -/* ----------------------------------------------------------- - Claim a bit sequence atomically ------------------------------------------------------------ */ - -// Try to atomically claim a sequence of `count` bits in a single -// field at `idx` in `bitmap`. Returns `true` on success. -bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx); - -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. -bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero); - -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - - -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. This is used in arena allocation -//-------------------------------------------------------------------------- - -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. 
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero); - -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -#endif diff --git a/src/page.c b/src/page.c index 54e7b539..f21bf91f 100644 --- a/src/page.c +++ b/src/page.c @@ -339,59 +339,6 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { } } -/* -// Abandon a page with used blocks at the end of a thread. -// Note: only call if it is ensured that no references exist from -// the `page->heap->thread_delayed_free` into this page. -// Currently only called through `mi_heap_collect_ex` which ensures this. -void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { - mi_assert_internal(page != NULL); - mi_assert_expensive(_mi_page_is_valid(page)); - mi_assert_internal(pq == mi_page_queue_of(page)); - mi_assert_internal(mi_page_heap(page) != NULL); - - mi_heap_t* pheap = mi_page_heap(page); - - // remove from our page list - mi_page_queue_remove(pq, page); - - // page is no longer associated with our heap - mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); - mi_page_set_heap(page, NULL); - -#if (MI_DEBUG>1) && !MI_TRACK_ENABLED - // check there are no references left.. - for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) { - mi_assert_internal(_mi_ptr_page(block) != page); - } -#endif - - // and abandon it - mi_assert_internal(mi_page_is_abandoned(page)); - _mi_arena_page_abandon(page, pheap->tld); -} - -// force abandon a page -void _mi_page_force_abandon(mi_page_t* page) { - mi_heap_t* heap = mi_page_heap(page); - // mark page as not using delayed free - _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); - - // ensure this page is no longer in the heap delayed free list - _mi_heap_delayed_free_all(heap); - // TODO: can we still access the page meta-info even if it is freed? - if (page->capacity == 0) return; // it may have been freed now - - // and now unlink it from the page queue and abandon (or free) - mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); - if (mi_page_all_free(page)) { - _mi_page_free(page, pq, false); - } - else { - _mi_page_abandon(page, pq); - } -} -*/ // Free a page with no more free blocks void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) { diff --git a/src/segment-map.c b/src/segment-map.c deleted file mode 100644 index 2c3964fe..00000000 --- a/src/segment-map.c +++ /dev/null @@ -1,126 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. 
------------------------------------------------------------------------------*/ - -/* ----------------------------------------------------------- - The following functions are to reliably find the segment or - block that encompasses any pointer p (or NULL if it is not - in any of our segments). - We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB) - set to 1 if it contains the segment meta data. ------------------------------------------------------------ */ -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/atomic.h" - -// Reduce total address space to reduce .bss (due to the `mi_segment_map`) -#if (MI_INTPTR_SIZE > 4) && MI_TRACK_ASAN -#define MI_SEGMENT_MAP_MAX_ADDRESS (128*1024ULL*MI_GiB) // 128 TiB (see issue #881) -#elif (MI_INTPTR_SIZE > 4) -#define MI_SEGMENT_MAP_MAX_ADDRESS (48*1024ULL*MI_GiB) // 48 TiB -#else -#define MI_SEGMENT_MAP_MAX_ADDRESS (UINT32_MAX) -#endif - -#define MI_SEGMENT_MAP_PART_SIZE (MI_INTPTR_SIZE*MI_KiB - 128) // 128 > sizeof(mi_memid_t) ! -#define MI_SEGMENT_MAP_PART_BITS (8*MI_SEGMENT_MAP_PART_SIZE) -#define MI_SEGMENT_MAP_PART_ENTRIES (MI_SEGMENT_MAP_PART_SIZE / MI_INTPTR_SIZE) -#define MI_SEGMENT_MAP_PART_BIT_SPAN (MI_SEGMENT_ALIGN) -#define MI_SEGMENT_MAP_PART_SPAN (MI_SEGMENT_MAP_PART_BITS * MI_SEGMENT_MAP_PART_BIT_SPAN) -#define MI_SEGMENT_MAP_MAX_PARTS ((MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_SPAN) + 1) - -// A part of the segment map. -typedef struct mi_segmap_part_s { - mi_memid_t memid; - _Atomic(uintptr_t) map[MI_SEGMENT_MAP_PART_ENTRIES]; -} mi_segmap_part_t; - -// Allocate parts on-demand to reduce .bss footprint -static _Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. } - -static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) { - // note: segment can be invalid or NULL. - mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? - *idx = 0; - *bitidx = 0; - if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL; - const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN; - if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL; - mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[segindex]); - - // allocate on demand to reduce .bss footprint - if (part == NULL) { - if (!create_on_demand) return NULL; - mi_memid_t memid; - part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid, NULL); - if (part == NULL) return NULL; - mi_segmap_part_t* expected = NULL; - if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) { - _mi_os_free(part, sizeof(mi_segmap_part_t), memid, NULL); - part = expected; - if (part == NULL) return NULL; - } - } - mi_assert(part != NULL); - const uintptr_t offset = ((uintptr_t)segment) % MI_SEGMENT_MAP_PART_SPAN; - const uintptr_t bitofs = offset / MI_SEGMENT_MAP_PART_BIT_SPAN; - *idx = bitofs / MI_INTPTR_BITS; - *bitidx = bitofs % MI_INTPTR_BITS; - return part; -} - -void _mi_segment_map_allocated_at(const mi_segment_t* segment) { - if (segment->memid.memkind == MI_MEM_ARENA) return; // we lookup segments first in the arena's and don't need the segment map - size_t index; - size_t bitidx; - mi_segmap_part_t* part = mi_segment_map_index_of(segment, true /* alloc map if needed */, &index, &bitidx); - if (part == NULL) return; // outside our address range.. 
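/* Illustration only, not part of this patch: the address-to-bit mapping used by the
   segment map above. An address is first mapped to an on-demand allocated part, and
   within that part to a word and a bit, with one bit per segment-aligned span. The
   constants and demo_* names below are simplified assumptions of the sketch. */
#include <stddef.h>
#include <stdint.h>

#define DEMO_BIT_SPAN   (64u * 1024u * 1024u)        // one bit per 64 MiB span (assumed)
#define DEMO_PART_BITS  (8u * (8u * 1024u - 128u))   // bits per part, as in the map above
#define DEMO_PART_SPAN  ((uint64_t)DEMO_PART_BITS * DEMO_BIT_SPAN)

typedef struct demo_map_pos_s {
  size_t part;      // which on-demand allocated part
  size_t word;      // which word inside that part's bit array
  size_t bit;       // which bit inside that word
} demo_map_pos_t;

static demo_map_pos_t demo_map_index_of(uintptr_t addr) {
  demo_map_pos_t pos;
  pos.part = (size_t)(addr / DEMO_PART_SPAN);
  const uint64_t bitofs = (addr % DEMO_PART_SPAN) / DEMO_BIT_SPAN;
  pos.word = (size_t)(bitofs / 64);                  // 64-bit words assumed
  pos.bit  = (size_t)(bitofs % 64);
  return pos;
}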
- uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); - uintptr_t newmask; - do { - newmask = (mask | ((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); -} - -void _mi_segment_map_freed_at(const mi_segment_t* segment) { - if (segment->memid.memkind == MI_MEM_ARENA) return; - size_t index; - size_t bitidx; - mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* don't alloc if not present */, &index, &bitidx); - if (part == NULL) return; // outside our address range.. - uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); - uintptr_t newmask; - do { - newmask = (mask & ~((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); -} - -// Determine the segment belonging to a pointer or NULL if it is not in a valid segment. -static mi_segment_t* _mi_segment_of(const void* p) { - if (p == NULL) return NULL; - mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL - size_t index; - size_t bitidx; - mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx); - if (part == NULL) return NULL; - const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); - if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) { - bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok); - return segment; // yes, allocated by us - } - return NULL; -} - -// Is this a valid pointer in our heap? -static bool mi_is_valid_pointer(const void* p) { - // first check if it is in an arena, then check if it is OS allocated - return (_mi_arena_contains(p) || _mi_segment_of(p) != NULL); -} - -mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - return mi_is_valid_pointer(p); -} diff --git a/src/segment.c b/src/segment.c deleted file mode 100644 index 74abcdbc..00000000 --- a/src/segment.c +++ /dev/null @@ -1,1387 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2018-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/atomic.h" - -#include // memset -#include - -#define MI_PAGE_HUGE_ALIGN (256*1024) - -static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); - -/* -------------------------------------------------------------------------------- - Segment allocation - We allocate pages inside bigger "segments" (4MiB on 64-bit). This is to avoid - splitting VMA's on Linux and reduce fragmentation on other OS's. - Each thread owns its own segments. - - Currently we have: - - small pages (64KiB), 64 in one segment - - medium pages (512KiB), 8 in one segment - - large pages (4MiB), 1 in one segment - - huge segments have 1 page in one segment that can be larger than `MI_SEGMENT_SIZE`. - it is used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or with alignment `> MI_BLOCK_ALIGNMENT_MAX`. - - The memory for a segment is usually committed on demand. - (i.e. we are careful to not touch the memory until we actually allocate a block there) - - If a thread ends, it "abandons" pages that still contain live blocks. 
- Such segments are abondoned and these can be reclaimed by still running threads, - (much like work-stealing). --------------------------------------------------------------------------------- */ - - -/* ----------------------------------------------------------- - Queue of segments containing free pages ------------------------------------------------------------ */ - -#if (MI_DEBUG>=3) -static bool mi_segment_queue_contains(const mi_segment_queue_t* queue, const mi_segment_t* segment) { - mi_assert_internal(segment != NULL); - mi_segment_t* list = queue->first; - while (list != NULL) { - if (list == segment) break; - mi_assert_internal(list->next==NULL || list->next->prev == list); - mi_assert_internal(list->prev==NULL || list->prev->next == list); - list = list->next; - } - return (list == segment); -} -#endif - -/* -static bool mi_segment_queue_is_empty(const mi_segment_queue_t* queue) { - return (queue->first == NULL); -} -*/ - -static void mi_segment_queue_remove(mi_segment_queue_t* queue, mi_segment_t* segment) { - mi_assert_expensive(mi_segment_queue_contains(queue, segment)); - if (segment->prev != NULL) segment->prev->next = segment->next; - if (segment->next != NULL) segment->next->prev = segment->prev; - if (segment == queue->first) queue->first = segment->next; - if (segment == queue->last) queue->last = segment->prev; - segment->next = NULL; - segment->prev = NULL; -} - -static void mi_segment_enqueue(mi_segment_queue_t* queue, mi_segment_t* segment) { - mi_assert_expensive(!mi_segment_queue_contains(queue, segment)); - segment->next = NULL; - segment->prev = queue->last; - if (queue->last != NULL) { - mi_assert_internal(queue->last->next == NULL); - queue->last->next = segment; - queue->last = segment; - } - else { - queue->last = queue->first = segment; - } -} - -static mi_segment_queue_t* mi_segment_free_queue_of_kind(mi_page_kind_t kind, mi_segments_tld_t* tld) { - if (kind == MI_PAGE_SMALL) return &tld->small_free; - else if (kind == MI_PAGE_MEDIUM) return &tld->medium_free; - else return NULL; -} - -static mi_segment_queue_t* mi_segment_free_queue(const mi_segment_t* segment, mi_segments_tld_t* tld) { - return mi_segment_free_queue_of_kind(segment->page_kind, tld); -} - -// remove from free queue if it is in one -static void mi_segment_remove_from_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_segment_queue_t* queue = mi_segment_free_queue(segment, tld); // may be NULL - bool in_queue = (queue!=NULL && (segment->next != NULL || segment->prev != NULL || queue->first == segment)); - if (in_queue) { - mi_segment_queue_remove(queue, segment); - } -} - -static void mi_segment_insert_in_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_segment_enqueue(mi_segment_free_queue(segment, tld), segment); -} - - -/* ----------------------------------------------------------- - Invariant checking ------------------------------------------------------------ */ - -#if (MI_DEBUG >= 2) || (MI_SECURE >= 2) -static size_t mi_segment_page_size(const mi_segment_t* segment) { - if (segment->capacity > 1) { - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); - return ((size_t)1 << segment->page_shift); - } - else { - mi_assert_internal(segment->page_kind >= MI_PAGE_LARGE); - return segment->segment_size; - } -} -#endif - -#if (MI_DEBUG>=2) -static bool mi_pages_purge_contains(const mi_page_t* page, mi_segments_tld_t* tld) { - mi_page_t* p = tld->pages_purge.first; - while (p != NULL) { - if (p == page) return true; - p = p->next; - } - return false; -} 
-#endif - -#if (MI_DEBUG>=3) -static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(segment != NULL); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(segment->used <= segment->capacity); - mi_assert_internal(segment->abandoned <= segment->used); - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1); // one large or huge page per segment - size_t nfree = 0; - for (size_t i = 0; i < segment->capacity; i++) { - const mi_page_t* const page = &segment->pages[i]; - if (!page->segment_in_use) { - nfree++; - } - if (page->segment_in_use) { - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - } - mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE)); - } - mi_assert_internal(nfree + segment->used == segment->capacity); - // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 - mi_assert_internal(segment->page_kind == MI_PAGE_HUGE || - (mi_segment_page_size(segment) * segment->capacity == segment->segment_size)); - return true; -} -#endif - -static bool mi_page_not_in_queue(const mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(page != NULL); - if (page->next != NULL || page->prev != NULL) { - mi_assert_internal(mi_pages_purge_contains(page, tld)); - return false; - } - else { - // both next and prev are NULL, check for singleton list - return (tld->pages_purge.first != page && tld->pages_purge.last != page); - } -} - - -/* ----------------------------------------------------------- - Guard pages ------------------------------------------------------------ */ - -static void mi_segment_protect_range(void* p, size_t size, bool protect) { - if (protect) { - _mi_os_protect(p, size); - } - else { - _mi_os_unprotect(p, size); - } -} - -static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t* tld) { - // add/remove guard pages - if (MI_SECURE != 0) { - // in secure mode, we set up a protected page in between the segment info and the page data - const size_t os_psize = _mi_os_page_size(); - mi_assert_internal((segment->segment_info_size - os_psize) >= (sizeof(mi_segment_t) + ((segment->capacity - 1) * sizeof(mi_page_t)))); - mi_assert_internal(((uintptr_t)segment + segment->segment_info_size) % os_psize == 0); - mi_segment_protect_range((uint8_t*)segment + segment->segment_info_size - os_psize, os_psize, protect); - #if (MI_SECURE >= 2) - if (segment->capacity == 1) - #endif - { - // and protect the last (or only) page too - mi_assert_internal(MI_SECURE <= 1 || segment->page_kind >= MI_PAGE_LARGE); - uint8_t* start = (uint8_t*)segment + segment->segment_size - os_psize; - if (protect && !segment->memid.initially_committed) { - if (protect) { - // ensure secure page is committed - if (_mi_os_commit(start, os_psize, NULL, tld->stats)) { // if this fails that is ok (as it is an unaccessible page) - mi_segment_protect_range(start, os_psize, protect); - } - } - } - else { - mi_segment_protect_range(start, os_psize, protect); - } - } - #if (MI_SECURE >= 2) - else { - // or protect every page - const size_t page_size = mi_segment_page_size(segment); - for (size_t i = 0; i < segment->capacity; i++) { - if (segment->pages[i].is_committed) { - mi_segment_protect_range((uint8_t*)segment + (i+1)*page_size - os_psize, os_psize, protect); - } - } - } - #endif - } -} - -/* ----------------------------------------------------------- - Page reset 
------------------------------------------------------------ */ - -static void mi_page_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { - // todo: should we purge the guard page as well when MI_SECURE>=2 ? - mi_assert_internal(page->is_committed); - mi_assert_internal(!page->segment_in_use); - if (!segment->allow_purge) return; - mi_assert_internal(page->used == 0); - mi_assert_internal(page->free == NULL); - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - size_t psize; - void* start = mi_segment_raw_page_start(segment, page, &psize); - const bool needs_recommit = _mi_os_purge(start, psize, tld->stats); - if (needs_recommit) { page->is_committed = false; } -} - -static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { - if (page->is_committed) return true; - mi_assert_internal(segment->allow_decommit); - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - - size_t psize; - uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); - bool is_zero = false; - const size_t gsize = (MI_SECURE >= 2 ? _mi_os_page_size() : 0); - bool ok = _mi_os_commit(start, psize + gsize, &is_zero, tld->stats); - if (!ok) return false; // failed to commit! - page->is_committed = true; - page->used = 0; - page->free = NULL; - page->is_zero_init = is_zero; - if (gsize > 0) { - mi_segment_protect_range(start + psize, gsize, true); - } - return true; -} - - -/* ----------------------------------------------------------- - The free page queue ------------------------------------------------------------ */ - -// we re-use the `free` field for the expiration counter. Since this is a -// a pointer size field while the clock is always 64-bit we need to guard -// against overflow, we use substraction to check for expiry which works -// as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) -static uint32_t mi_page_get_expire( mi_page_t* page ) { - return (uint32_t)((uintptr_t)page->free); -} - -static void mi_page_set_expire( mi_page_t* page, uint32_t expire ) { - page->free = (mi_block_t*)((uintptr_t)expire); -} - -static void mi_page_purge_set_expire(mi_page_t* page) { - mi_assert_internal(mi_page_get_expire(page)==0); - uint32_t expire = (uint32_t)_mi_clock_now() + mi_option_get(mi_option_purge_delay); - mi_page_set_expire(page, expire); -} - -// we re-use the `free` field for the expiration counter. Since this is a -// a pointer size field while the clock is always 64-bit we need to guard -// against overflow, we use substraction to check for expiry which work -// as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) -static bool mi_page_purge_is_expired(mi_page_t* page, mi_msecs_t now) { - int32_t expire = (int32_t)mi_page_get_expire(page); - return (((int32_t)now - expire) >= 0); -} - -static void mi_segment_schedule_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(!page->segment_in_use); - mi_assert_internal(mi_page_not_in_queue(page,tld)); - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - mi_assert_internal(_mi_page_segment(page)==segment); - if (!segment->allow_purge) return; - - if (mi_option_get(mi_option_purge_delay) == 0) { - // purge immediately? 
- mi_page_purge(segment, page, tld); - } - else if (mi_option_get(mi_option_purge_delay) > 0) { // no purging if the delay is negative - // otherwise push on the delayed page reset queue - mi_page_queue_t* pq = &tld->pages_purge; - // push on top - mi_page_purge_set_expire(page); - page->next = pq->first; - page->prev = NULL; - if (pq->first == NULL) { - mi_assert_internal(pq->last == NULL); - pq->first = pq->last = page; - } - else { - pq->first->prev = page; - pq->first = page; - } - } -} - -static void mi_page_purge_remove(mi_page_t* page, mi_segments_tld_t* tld) { - if (mi_page_not_in_queue(page,tld)) return; - - mi_page_queue_t* pq = &tld->pages_purge; - mi_assert_internal(pq!=NULL); - mi_assert_internal(!page->segment_in_use); - mi_assert_internal(mi_page_get_expire(page) != 0); - mi_assert_internal(mi_pages_purge_contains(page, tld)); - if (page->prev != NULL) page->prev->next = page->next; - if (page->next != NULL) page->next->prev = page->prev; - if (page == pq->last) pq->last = page->prev; - if (page == pq->first) pq->first = page->next; - page->next = page->prev = NULL; - mi_page_set_expire(page,0); -} - -static void mi_segment_remove_all_purges(mi_segment_t* segment, bool force_purge, mi_segments_tld_t* tld) { - if (segment->memid.is_pinned) return; // never reset in huge OS pages - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (!page->segment_in_use) { - mi_page_purge_remove(page, tld); - if (force_purge && page->is_committed) { - mi_page_purge(segment, page, tld); - } - } - else { - mi_assert_internal(mi_page_not_in_queue(page,tld)); - } - } -} - -static void mi_pages_try_purge(bool force, mi_segments_tld_t* tld) { - if (mi_option_get(mi_option_purge_delay) < 0) return; // purging is not allowed - - mi_msecs_t now = _mi_clock_now(); - mi_page_queue_t* pq = &tld->pages_purge; - // from oldest up to the first that has not expired yet - mi_page_t* page = pq->last; - while (page != NULL && (force || mi_page_purge_is_expired(page,now))) { - mi_page_t* const prev = page->prev; // save previous field - mi_page_purge_remove(page, tld); // remove from the list to maintain invariant for mi_page_purge - mi_page_purge(_mi_page_segment(page), page, tld); - page = prev; - } - // discard the reset pages from the queue - pq->last = page; - if (page != NULL){ - page->next = NULL; - } - else { - pq->first = NULL; - } -} - - -/* ----------------------------------------------------------- - Segment size calculations ------------------------------------------------------------ */ - -static size_t mi_segment_raw_page_size(const mi_segment_t* segment) { - return (segment->page_kind == MI_PAGE_HUGE ? segment->segment_size : (size_t)1 << segment->page_shift); -} - -// Raw start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) -// The raw start is not taking aligned block allocation into consideration. 
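/* Illustration only, not part of this patch: the wrap-safe expiration check used by
   the purge queue above. The 64-bit clock is truncated to 32 bits when stored in the
   page's `free` field, and expiry is tested via signed subtraction so the comparison
   keeps working after the 32-bit value wraps, as long as the purge delay stays far
   below 2^31 milliseconds. The demo_* names are assumptions of the sketch. */
#include <stdbool.h>
#include <stdint.h>

typedef int64_t demo_msecs_t;    // stands in for the 64-bit millisecond clock

static inline uint32_t demo_make_expire(demo_msecs_t now, uint32_t delay_ms) {
  return (uint32_t)now + delay_ms;                   // truncated; may wrap around
}

static inline bool demo_is_expired(uint32_t expire, demo_msecs_t now) {
  // difference of the truncated clocks, interpreted as signed: it becomes >= 0
  // exactly when `now` has reached or passed `expire`, wrap-around included
  return ((int32_t)((uint32_t)now - expire) >= 0);
}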
-static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { - size_t psize = mi_segment_raw_page_size(segment); - uint8_t* p = (uint8_t*)segment + page->segment_idx * psize; - - if (page->segment_idx == 0) { - // the first page starts after the segment info (and possible guard page) - p += segment->segment_info_size; - psize -= segment->segment_info_size; - } - -#if (MI_SECURE > 1) // every page has an os guard page - psize -= _mi_os_page_size(); -#elif (MI_SECURE==1) // the last page has an os guard page at the end - if (page->segment_idx == segment->capacity - 1) { - psize -= _mi_os_page_size(); - } -#endif - - if (page_size != NULL) *page_size = psize; - mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page); - mi_assert_internal(_mi_ptr_segment(p) == segment); - return p; -} - -// Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) -{ - size_t psize; - uint8_t* p = mi_segment_raw_page_start(segment, page, &psize); - const size_t block_size = mi_page_block_size(page); - if (/*page->segment_idx == 0 &&*/ block_size > 0 && block_size <= MI_MAX_ALIGN_GUARANTEE) { - // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); - size_t adjust = block_size - ((uintptr_t)p % block_size); - if (adjust < block_size && psize >= block_size + adjust) { - p += adjust; - psize -= adjust; - mi_assert_internal((uintptr_t)p % block_size == 0); - } - } - mi_assert_internal(_mi_is_aligned(p, MI_MAX_ALIGN_SIZE)); - mi_assert_internal(block_size == 0 || block_size > MI_MAX_ALIGN_GUARANTEE || _mi_is_aligned(p,block_size)); - - if (page_size != NULL) *page_size = psize; - mi_assert_internal(_mi_ptr_page(p) == page); - mi_assert_internal(_mi_ptr_segment(p) == segment); - return p; -} - - -static size_t mi_segment_calculate_sizes(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) -{ - const size_t minsize = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */; - size_t guardsize = 0; - size_t isize = 0; - - - if (MI_SECURE == 0) { - // normally no guard pages - #if MI_GUARDED - isize = _mi_align_up(minsize, _mi_os_page_size()); - #else - isize = _mi_align_up(minsize, 16 * MI_MAX_ALIGN_SIZE); - #endif - } - else { - // in secure mode, we set up a protected page in between the segment info - // and the page data (and one at the end of the segment) - const size_t page_size = _mi_os_page_size(); - isize = _mi_align_up(minsize, page_size); - guardsize = page_size; - //required = _mi_align_up(required, isize + guardsize); - } - - if (info_size != NULL) *info_size = isize; - if (pre_size != NULL) *pre_size = isize + guardsize; - return (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + 2*guardsize, MI_PAGE_HUGE_ALIGN) ); -} - - -/* ---------------------------------------------------------------------------- -Segment caches -We keep a small segment cache per thread to increase local -reuse and avoid setting/clearing guard pages in secure mode. 
-------------------------------------------------------------------------------- */ - -static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) { - if (segment_size>=0) _mi_stat_increase(&tld->stats->segments,1); - else _mi_stat_decrease(&tld->stats->segments,1); - tld->count += (segment_size >= 0 ? 1 : -1); - if (tld->count > tld->peak_count) tld->peak_count = tld->count; - tld->current_size += segment_size; - if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size; -} - -static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) { - segment->thread_id = 0; - _mi_segment_map_freed_at(segment); - mi_segments_track_size(-((long)segment_size),tld); - if (segment->was_reclaimed) { - tld->reclaim_count--; - segment->was_reclaimed = false; - } - - if (MI_SECURE != 0) { - mi_assert_internal(!segment->memid.is_pinned); - mi_segment_protect(segment, false, tld->os); // ensure no more guard pages are set - } - - bool fully_committed = true; - size_t committed_size = 0; - const size_t page_size = mi_segment_raw_page_size(segment); - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (page->is_committed) { committed_size += page_size; } - if (!page->is_committed) { fully_committed = false; } - } - MI_UNUSED(fully_committed); - mi_assert_internal((fully_committed && committed_size == segment_size) || (!fully_committed && committed_size < segment_size)); - - _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats); -} - -// called from `heap_collect`. -void _mi_segments_collect(bool force, mi_segments_tld_t* tld) { - mi_pages_try_purge(force,tld); - #if MI_DEBUG>=2 - if (!_mi_is_main_thread()) { - mi_assert_internal(tld->pages_purge.first == NULL); - mi_assert_internal(tld->pages_purge.last == NULL); - } - #endif -} - - -/* ----------------------------------------------------------- - Segment allocation ------------------------------------------------------------ */ - -static mi_segment_t* mi_segment_os_alloc(bool eager_delayed, size_t page_alignment, mi_arena_id_t req_arena_id, - size_t pre_size, size_t info_size, bool commit, size_t segment_size, - mi_segments_tld_t* tld, mi_os_tld_t* tld_os) -{ - mi_memid_t memid; - bool allow_large = (!eager_delayed && (MI_SECURE == 0)); // only allow large OS pages once we are no longer lazy - size_t align_offset = 0; - size_t alignment = MI_SEGMENT_SIZE; - if (page_alignment > 0) { - alignment = page_alignment; - align_offset = _mi_align_up(pre_size, MI_SEGMENT_SIZE); - segment_size = segment_size + (align_offset - pre_size); // adjust the segment size - } - - mi_segment_t* segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, commit, allow_large, req_arena_id, &memid, tld_os); - if (segment == NULL) { - return NULL; // failed to allocate - } - - if (!memid.initially_committed) { - // ensure the initial info is committed - mi_assert_internal(!memid.is_pinned); - bool ok = _mi_os_commit(segment, pre_size, NULL, tld_os->stats); - if (!ok) { - // commit failed; we cannot touch the memory: free the segment directly and return `NULL` - _mi_arena_free(segment, segment_size, 0, memid, tld_os->stats); - return NULL; - } - } - - MI_UNUSED(info_size); - segment->memid = memid; - segment->allow_decommit = !memid.is_pinned; - segment->allow_purge = segment->allow_decommit && (mi_option_get(mi_option_purge_delay) >= 0); - segment->segment_size = segment_size; - segment->subproc = tld->subproc; - 
mi_segments_track_size((long)(segment_size), tld); - _mi_segment_map_allocated_at(segment); - return segment; -} - -// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` . -static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, size_t page_shift, size_t page_alignment, - mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) -{ - // required is only > 0 for huge page allocations - mi_assert_internal((required > 0 && page_kind > MI_PAGE_LARGE)|| (required==0 && page_kind <= MI_PAGE_LARGE)); - - // calculate needed sizes first - size_t capacity; - if (page_kind == MI_PAGE_HUGE) { - mi_assert_internal(page_shift == MI_SEGMENT_SHIFT + 1 && required > 0); - capacity = 1; - } - else { - mi_assert_internal(required == 0 && page_alignment == 0); - size_t page_size = (size_t)1 << page_shift; - capacity = MI_SEGMENT_SIZE / page_size; - mi_assert_internal(MI_SEGMENT_SIZE % page_size == 0); - mi_assert_internal(capacity >= 1 && capacity <= MI_SMALL_PAGES_PER_SEGMENT); - } - size_t info_size; - size_t pre_size; - const size_t init_segment_size = mi_segment_calculate_sizes(capacity, required, &pre_size, &info_size); - mi_assert_internal(init_segment_size >= required); - - // Initialize parameters - const bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && // don't delay for large objects - // !_mi_os_has_overcommit() && // never delay on overcommit systems - _mi_current_thread_count() > 1 && // do not delay for the first N threads - tld->peak_count < (size_t)mi_option_get(mi_option_eager_commit_delay)); - const bool eager = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit); - const bool init_commit = eager; // || (page_kind >= MI_PAGE_LARGE); - - // Allocate the segment from the OS (segment_size can change due to alignment) - mi_segment_t* segment = mi_segment_os_alloc(eager_delayed, page_alignment, req_arena_id, pre_size, info_size, init_commit, init_segment_size, tld, os_tld); - if (segment == NULL) return NULL; - mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); - mi_assert_internal(segment->memid.is_pinned ? 
segment->memid.initially_committed : true); - - // zero the segment info (but not the `mem` fields) - ptrdiff_t ofs = offsetof(mi_segment_t, next); - _mi_memzero((uint8_t*)segment + ofs, info_size - ofs); - - // initialize pages info - const bool is_huge = (page_kind == MI_PAGE_HUGE); - for (size_t i = 0; i < capacity; i++) { - mi_assert_internal(i <= 255); - segment->pages[i].segment_idx = (uint8_t)i; - segment->pages[i].is_committed = segment->memid.initially_committed; - segment->pages[i].is_zero_init = segment->memid.initially_zero; - segment->pages[i].is_huge = is_huge; - } - - // initialize - segment->page_kind = page_kind; - segment->capacity = capacity; - segment->page_shift = page_shift; - segment->segment_info_size = pre_size; - segment->thread_id = _mi_thread_id(); - segment->cookie = _mi_ptr_cookie(segment); - - // set protection - mi_segment_protect(segment, true, tld->os); - - // insert in free lists for small and medium pages - if (page_kind <= MI_PAGE_MEDIUM) { - mi_segment_insert_in_free_queue(segment, tld); - } - - return segment; -} - - -static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { - MI_UNUSED(force); - mi_assert(segment != NULL); - - // in `mi_segment_force_abandon` we set this to true to ensure the segment's memory stays valid - if (segment->dont_free) return; - - // don't purge as we are freeing now - mi_segment_remove_all_purges(segment, false /* don't force as we are about to free */, tld); - mi_segment_remove_from_free_queue(segment, tld); - - mi_assert_expensive(!mi_segment_queue_contains(&tld->small_free, segment)); - mi_assert_expensive(!mi_segment_queue_contains(&tld->medium_free, segment)); - mi_assert(segment->next == NULL); - mi_assert(segment->prev == NULL); - _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); - - // return it to the OS - mi_segment_os_free(segment, segment->segment_size, tld); -} - -/* ----------------------------------------------------------- - Free page management inside a segment ------------------------------------------------------------ */ - - -static bool mi_segment_has_free(const mi_segment_t* segment) { - return (segment->used < segment->capacity); -} - -static bool mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(_mi_page_segment(page) == segment); - mi_assert_internal(!page->segment_in_use); - mi_page_purge_remove(page, tld); - - // check commit - if (!mi_page_ensure_committed(segment, page, tld)) return false; - - // set in-use before doing unreset to prevent delayed reset - page->segment_in_use = true; - segment->used++; - mi_assert_internal(page->segment_in_use && page->is_committed && page->used==0 && !mi_pages_purge_contains(page,tld)); - mi_assert_internal(segment->used <= segment->capacity); - if (segment->used == segment->capacity && segment->page_kind <= MI_PAGE_MEDIUM) { - // if no more free pages, remove from the queue - mi_assert_internal(!mi_segment_has_free(segment)); - mi_segment_remove_from_free_queue(segment, tld); - } - return true; -} - - -/* ----------------------------------------------------------- - Free ------------------------------------------------------------ */ - -static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); - -// clear page data; can be called on abandoned segments -static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) -{ - mi_assert_internal(page->segment_in_use); - 
mi_assert_internal(mi_page_all_free(page)); - mi_assert_internal(page->is_committed); - mi_assert_internal(mi_page_not_in_queue(page, tld)); - - size_t inuse = page->capacity * mi_page_block_size(page); - _mi_stat_decrease(&tld->stats->page_committed, inuse); - _mi_stat_decrease(&tld->stats->pages, 1); - - page->is_zero_init = false; - page->segment_in_use = false; - - // zero the page data, but not the segment fields and capacity, page start, and block_size (for page size calculations) - size_t block_size = page->block_size; - uint8_t block_size_shift = page->block_size_shift; - uint8_t heap_tag = page->heap_tag; - uint8_t* page_start = page->page_start; - uint16_t capacity = page->capacity; - uint16_t reserved = page->reserved; - ptrdiff_t ofs = offsetof(mi_page_t,capacity); - _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); - page->capacity = capacity; - page->reserved = reserved; - page->block_size = block_size; - page->block_size_shift = block_size_shift; - page->heap_tag = heap_tag; - page->page_start = page_start; - segment->used--; - - // schedule purge - mi_segment_schedule_purge(segment, page, tld); - - page->capacity = 0; // after purge these can be zero'd now - page->reserved = 0; -} - -void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) -{ - mi_assert(page != NULL); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_expensive(mi_segment_is_valid(segment,tld)); - mi_pages_try_purge(false /*force?*/, tld); - - // mark it as free now - mi_segment_page_clear(segment, page, tld); - - if (segment->used == 0) { - // no more used pages; remove from the free list and free the segment - mi_segment_free(segment, force, tld); - } - else { - if (segment->used == segment->abandoned) { - // only abandoned pages; remove from free list and abandon - mi_segment_abandon(segment,tld); - } - else if (segment->used + 1 == segment->capacity) { - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // large and huge pages are always the single page in a segment - if (segment->page_kind <= MI_PAGE_MEDIUM) { - // move back to segments free list - mi_segment_insert_in_free_queue(segment,tld); - } - } - } -} - - -/* ----------------------------------------------------------- -Abandonment - -When threads terminate, they can leave segments with -live blocks (reached through other threads). Such segments -are "abandoned" and will be reclaimed by other threads to -reuse their pages and/or free them eventually. The -`thread_id` of such segments is 0. - -When a block is freed in an abandoned segment, the segment -is reclaimed into that thread. - -Moreover, if threads are looking for a fresh segment, they -will first consider abondoned segments -- these can be found -by scanning the arena memory -(segments outside arena memoryare only reclaimed by a free). ------------------------------------------------------------ */ - -/* ----------------------------------------------------------- - Abandon segment/page ------------------------------------------------------------ */ - -static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(segment->used == segment->abandoned); - mi_assert_internal(segment->used > 0); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - - // Potentially force purge. Only abandoned segments in arena memory can be - // reclaimed without a free so if a segment is not from an arena we force purge here to be conservative. 
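/* Illustration only, not part of this patch: the partial-reset idiom used by
   mi_segment_page_clear above -- zero a struct from a chosen member onward using
   offsetof, then restore the few fields that must survive the reset. The struct
   layout and demo_* names are assumptions of the sketch. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct demo_page_s {
  uint8_t  segment_idx;   // kept as-is: lives before the cleared region
  uint16_t capacity;      // cleared from this member onward...
  uint16_t reserved;
  size_t   block_size;
  void*    free_list;
} demo_page_t;

static void demo_partial_clear(demo_page_t* page) {
  const size_t   block_size = page->block_size;           // save the survivors
  const uint16_t capacity   = page->capacity;
  const size_t ofs = offsetof(demo_page_t, capacity);     // first member to clear
  memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);   // clear the tail of the struct
  page->capacity   = capacity;                            // put the survivors back
  page->block_size = block_size;
}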
- mi_pages_try_purge(false /*force?*/,tld); - const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); - mi_segment_remove_all_purges(segment, force_purge, tld); - - // remove the segment from the free page queue if needed - mi_segment_remove_from_free_queue(segment, tld); - mi_assert_internal(segment->next == NULL && segment->prev == NULL); - - // all pages in the segment are abandoned; add it to the abandoned list - _mi_stat_increase(&tld->stats->segments_abandoned, 1); - mi_segments_track_size(-((long)segment->segment_size), tld); - segment->abandoned_visits = 0; - if (segment->was_reclaimed) { - tld->reclaim_count--; - segment->was_reclaimed = false; - } - _mi_arena_segment_mark_abandoned(segment); -} - -void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert(page != NULL); - mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); - mi_assert_internal(mi_page_heap(page) == NULL); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - segment->abandoned++; - _mi_stat_increase(&tld->stats->pages_abandoned, 1); - mi_assert_internal(segment->abandoned <= segment->used); - if (segment->used == segment->abandoned) { - // all pages are abandoned, abandon the entire segment - mi_segment_abandon(segment, tld); - } -} - -/* ----------------------------------------------------------- - Reclaim abandoned pages ------------------------------------------------------------ */ - -// Possibly clear pages and check if free space is available -static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free) -{ - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - bool has_page = false; - size_t pages_used = 0; - size_t pages_used_empty = 0; - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (page->segment_in_use) { - pages_used++; - // ensure used count is up to date and collect potential concurrent frees - _mi_page_free_collect(page, false); - if (mi_page_all_free(page)) { - // if everything free already, page can be reused for some block size - // note: don't clear the page yet as we can only OS reset it once it is reclaimed - pages_used_empty++; - has_page = true; - } - else if (mi_page_block_size(page) == block_size && mi_page_has_any_available(page)) { - // a page has available free blocks of the right size - has_page = true; - } - } - else { - // whole empty page - has_page = true; - } - } - mi_assert_internal(pages_used == segment->used && pages_used >= pages_used_empty); - if (all_pages_free != NULL) { - *all_pages_free = ((pages_used - pages_used_empty) == 0); - } - return has_page; -} - - -// Reclaim a segment; returns NULL if the segment was freed -// set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. -static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { - if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } - // can be 0 still with abandoned_next, or already a thread id for segments outside an arena that are reclaimed on a free. 
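
[editor's note] `mi_segment_check_free` above boils down to counting: how many pages are still in use, and how many of those turn out to be completely free once pending remote frees are collected. A standalone model of that decision (page states simplified to an enum; the real code additionally matches block sizes when looking for a reusable page):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

typedef enum page_state_e {
  PAGE_UNUSED,        // slot not claimed at all
  PAGE_IN_USE,        // claimed and still holds live blocks
  PAGE_IN_USE_EMPTY   // claimed, but all its blocks turned out to be free
} page_state_t;

// Returns true if the segment has a page worth reclaiming; sets
// *all_pages_free when no page holds live blocks anymore.
static bool check_free(const page_state_t* pages, size_t capacity, bool* all_pages_free) {
  size_t used = 0, used_empty = 0;
  bool has_page = false;
  for (size_t i = 0; i < capacity; i++) {
    if (pages[i] == PAGE_UNUSED) { has_page = true; }
    else {
      used++;
      if (pages[i] == PAGE_IN_USE_EMPTY) { used_empty++; has_page = true; }
    }
  }
  if (all_pages_free != NULL) { *all_pages_free = (used == used_empty); }
  return has_page;
}

int main(void) {
  page_state_t pages[4] = { PAGE_IN_USE_EMPTY, PAGE_IN_USE_EMPTY, PAGE_UNUSED, PAGE_UNUSED };
  bool all_free;
  assert(check_free(pages, 4, &all_free) && all_free);
  pages[0] = PAGE_IN_USE;
  assert(check_free(pages, 4, &all_free) && !all_free);
  return 0;
}
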
- mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); - mi_assert_internal(segment->subproc == heap->tld->segments.subproc); // only reclaim within the same subprocess - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - segment->abandoned_visits = 0; - segment->was_reclaimed = true; - tld->reclaim_count++; - mi_segments_track_size((long)segment->segment_size, tld); - mi_assert_internal(segment->next == NULL && segment->prev == NULL); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - _mi_stat_decrease(&tld->stats->segments_abandoned, 1); - - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (page->segment_in_use) { - mi_assert_internal(page->is_committed); - mi_assert_internal(mi_page_not_in_queue(page, tld)); - mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); - mi_assert_internal(mi_page_heap(page) == NULL); - segment->abandoned--; - mi_assert(page->next == NULL); - _mi_stat_decrease(&tld->stats->pages_abandoned, 1); - // get the target heap for this thread which has a matching heap tag (so we reclaim into a matching heap) - mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects - if (target_heap == NULL) { - target_heap = heap; - _mi_error_message(EFAULT, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); - } - // associate the heap with this page, and allow heap thread delayed free again. - mi_page_set_heap(page, target_heap); - _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) - _mi_page_free_collect(page, false); // ensure used count is up to date - if (mi_page_all_free(page)) { - // if everything free already, clear the page directly - mi_segment_page_clear(segment, page, tld); // reset is ok now - } - else { - // otherwise reclaim it into the heap - _mi_page_reclaim(target_heap, page); - if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page) && heap == target_heap) { - if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; } - } - } - } - /* expired - else if (page->is_committed) { // not in-use, and not reset yet - // note: do not reset as this includes pages that were not touched before - // mi_pages_purge_add(segment, page, tld); - } - */ - } - mi_assert_internal(segment->abandoned == 0); - if (segment->used == 0) { - mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed)); - mi_segment_free(segment, false, tld); - return NULL; - } - else { - if (segment->page_kind <= MI_PAGE_MEDIUM && mi_segment_has_free(segment)) { - mi_segment_insert_in_free_queue(segment, tld); - } - return segment; - } -} - - -// attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) -bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { - if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned - if (segment->subproc != heap->tld->segments.subproc) return false; // only reclaim within the same subprocess - if (!_mi_heap_memid_is_suitable(heap,segment->memid)) return false; // don't reclaim between exclusive and non-exclusive arena's - const long target = _mi_option_get_fast(mi_option_target_segments_per_thread); - if (target > 0 && (size_t)target <= heap->tld->segments.count) return false; // don't 
reclaim if going above the target count - - // don't reclaim more from a `free` call than half the current segments - // this is to prevent a pure free-ing thread to start owning too many segments - // (but not for out-of-arena segments as that is the main way to be reclaimed for those) - if (segment->memid.memkind == MI_MEM_ARENA && heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) { - return false; - } - if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon - mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); - mi_assert_internal(res == segment); - return (res != NULL); - } - return false; -} - -void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { - mi_segment_t* segment; - mi_arena_field_cursor_t current; - _mi_arena_field_cursor_init(heap, tld->subproc, true /* visit all, blocking */, ¤t); - while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - _mi_arena_field_cursor_done(¤t); -} - - -static bool segment_count_is_within_target(mi_segments_tld_t* tld, size_t* ptarget) { - const size_t target = (size_t)mi_option_get_clamp(mi_option_target_segments_per_thread, 0, 1024); - if (ptarget != NULL) { *ptarget = target; } - return (target == 0 || tld->count < target); -} - -static long mi_segment_get_reclaim_tries(mi_segments_tld_t* tld) { - // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. - const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); - if (perc <= 0) return 0; - const size_t total_count = mi_atomic_load_relaxed(&tld->subproc->abandoned_count); - if (total_count == 0) return 0; - const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow - long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 1024 : relative_count)); - if (max_tries < 8 && total_count > 8) { max_tries = 8; } - return max_tries; -} - -static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) -{ - *reclaimed = false; - long max_tries = mi_segment_get_reclaim_tries(tld); - if (max_tries <= 0) return NULL; - - mi_segment_t* result = NULL; - mi_segment_t* segment = NULL; - mi_arena_field_cursor_t current; - _mi_arena_field_cursor_init(heap, tld->subproc, false /* non-blocking */, ¤t); - while (segment_count_is_within_target(tld,NULL) && (max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) - { - mi_assert(segment->subproc == heap->tld->segments.subproc); // cursor only visits segments in our sub-process - segment->abandoned_visits++; - // todo: should we respect numa affinity for abondoned reclaim? perhaps only for the first visit? - // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries - // Perhaps we can skip non-suitable ones in a better way? - bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid); - bool all_pages_free; - bool has_page = mi_segment_check_free(segment,block_size,&all_pages_free); // try to free up pages (due to concurrent frees) - if (all_pages_free) { - // free the segment (by forced reclaim) to make it available to other threads. - // note1: we prefer to free a segment as that might lead to reclaiming another - // segment that is still partially used. 
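
[editor's note] The try-count in `mi_segment_get_reclaim_tries` above is pure arithmetic: roughly `perc` percent (default 10%) of the abandoned segments, computed overflow-safely, then clamped to at most 1024 and bumped to at least 8 when enough segments are abandoned. A standalone sketch mirroring that calculation (here `perc` is a plain parameter; the real code reads and clamps the `max_segment_reclaim` option first):

#include <assert.h>
#include <stddef.h>

static long reclaim_tries(size_t total_abandoned, size_t perc) {
  if (perc == 0 || total_abandoned == 0) return 0;
  // compute (total * perc) / 100 without overflowing for very large counts
  const size_t relative = (total_abandoned > 10000
                             ? (total_abandoned / 100) * perc
                             : (total_abandoned * perc) / 100);
  long max_tries = (long)(relative <= 1 ? 1 : (relative > 1024 ? 1024 : relative));
  if (max_tries < 8 && total_abandoned > 8) { max_tries = 8; }
  return max_tries;
}

int main(void) {
  assert(reclaim_tries(0, 10)       == 0);    // nothing abandoned, nothing to try
  assert(reclaim_tries(50, 10)      == 8);    // 10% would be 5, bumped to the minimum of 8
  assert(reclaim_tries(2000, 10)    == 200);  // plain 10%
  assert(reclaim_tries(2000000, 10) == 1024); // clamped at the maximum
  return 0;
}
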
- // note2: we could in principle optimize this by skipping reclaim and directly - // freeing but that would violate some invariants temporarily) - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - else if (has_page && segment->page_kind == page_kind && is_suitable) { - // found a free page of the right kind, or page of the right block_size with free space - // we return the result of reclaim (which is usually `segment`) as it might free - // the segment due to concurrent frees (in which case `NULL` is returned). - result = mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); - break; - } - else if (segment->abandoned_visits > 3 && is_suitable) { - // always reclaim on 3rd visit to limit the abandoned segment count. - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - else { - // otherwise, mark it back as abandoned - // todo: reset delayed pages in the segment? - _mi_arena_segment_mark_abandoned(segment); - } - } - _mi_arena_field_cursor_done(¤t); - return result; -} - - -/* ----------------------------------------------------------- - Force abandon a segment that is in use by our thread ------------------------------------------------------------ */ - -// force abandon a segment -static void mi_segment_force_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) -{ - mi_assert_internal(segment->abandoned < segment->used); - mi_assert_internal(!segment->dont_free); - - // ensure the segment does not get free'd underneath us (so we can check if a page has been freed in `mi_page_force_abandon`) - segment->dont_free = true; - - // for all pages - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (page->segment_in_use) { - // abandon the page if it is still in-use (this will free the page if possible as well (but not our segment)) - mi_assert_internal(segment->used > 0); - if (segment->used == segment->abandoned+1) { - // the last page.. abandon and return as the segment will be abandoned after this - // and we should no longer access it. - segment->dont_free = false; - _mi_page_force_abandon(page); - return; - } - else { - // abandon and continue - _mi_page_force_abandon(page); - } - } - } - segment->dont_free = false; - mi_assert(segment->used == segment->abandoned); - mi_assert(segment->used == 0); - if (segment->used == 0) { // paranoia - // all free now - mi_segment_free(segment, false, tld); - } - else { - // perform delayed purges - mi_pages_try_purge(false /* force? */, tld); - } -} - - -// try abandon segments. -// this should be called from `reclaim_or_alloc` so we know all segments are (about) fully in use. -static void mi_segments_try_abandon_to_target(mi_heap_t* heap, size_t target, mi_segments_tld_t* tld) { - if (target <= 1) return; - const size_t min_target = (target > 4 ? (target*3)/4 : target); // 75% - // todo: we should maintain a list of segments per thread; for now, only consider segments from the heap full pages - for (int i = 0; i < 64 && tld->count >= min_target; i++) { - mi_page_t* page = heap->pages[MI_BIN_FULL].first; - while (page != NULL && mi_page_is_huge(page)) { - page = page->next; - } - if (page==NULL) { - break; - } - mi_segment_t* segment = _mi_page_segment(page); - mi_segment_force_abandon(segment, tld); - mi_assert_internal(page != heap->pages[MI_BIN_FULL].first); // as it is just abandoned - } -} - -// try abandon segments. -// this should be called from `reclaim_or_alloc` so we know all segments are (about) fully in use. 
-static void mi_segments_try_abandon(mi_heap_t* heap, mi_segments_tld_t* tld) { - // we call this when we are about to add a fresh segment so we should be under our target segment count. - size_t target = 0; - if (segment_count_is_within_target(tld, &target)) return; - mi_segments_try_abandon_to_target(heap, target, tld); -} - -void mi_collect_reduce(size_t target_size) mi_attr_noexcept { - mi_collect(true); - mi_heap_t* heap = mi_heap_get_default(); - mi_segments_tld_t* tld = &heap->tld->segments; - size_t target = target_size / MI_SEGMENT_SIZE; - if (target == 0) { - target = (size_t)mi_option_get_clamp(mi_option_target_segments_per_thread, 1, 1024); - } - mi_segments_try_abandon_to_target(heap, target, tld); -} - -/* ----------------------------------------------------------- - Reclaim or allocate ------------------------------------------------------------ */ - -static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) -{ - mi_assert_internal(page_kind <= MI_PAGE_LARGE); - mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); - - // try to abandon some segments to increase reuse between threads - mi_segments_try_abandon(heap,tld); - - // 1. try to reclaim an abandoned segment - bool reclaimed; - mi_segment_t* segment = mi_segment_try_reclaim(heap, block_size, page_kind, &reclaimed, tld); - mi_assert_internal(segment == NULL || _mi_arena_memid_is_suitable(segment->memid, heap->arena_id)); - if (reclaimed) { - // reclaimed the right page right into the heap - mi_assert_internal(segment != NULL && segment->page_kind == page_kind && page_kind <= MI_PAGE_LARGE); - return NULL; // pretend out-of-memory as the page will be in the page queue of the heap with available blocks - } - else if (segment != NULL) { - // reclaimed a segment with empty pages (of `page_kind`) in it - return segment; - } - // 2. otherwise allocate a fresh segment - return mi_segment_alloc(0, page_kind, page_shift, 0, heap->arena_id, tld, os_tld); -} - - -/* ----------------------------------------------------------- - Small page allocation ------------------------------------------------------------ */ - -static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(mi_segment_has_free(segment)); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - for (size_t i = 0; i < segment->capacity; i++) { // TODO: use a bitmap instead of search? - mi_page_t* page = &segment->pages[i]; - if (!page->segment_in_use) { - bool ok = mi_segment_page_claim(segment, page, tld); - if (ok) return page; - } - } - mi_assert(false); - return NULL; -} - -// Allocate a page inside a segment. 
Requires that the page has free pages -static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(mi_segment_has_free(segment)); - return mi_segment_find_free(segment, tld); -} - -static mi_page_t* mi_segment_page_try_alloc_in_queue(mi_heap_t* heap, mi_page_kind_t kind, mi_segments_tld_t* tld) { - // find an available segment the segment free queue - mi_segment_queue_t* const free_queue = mi_segment_free_queue_of_kind(kind, tld); - for (mi_segment_t* segment = free_queue->first; segment != NULL; segment = segment->next) { - if (_mi_arena_memid_is_suitable(segment->memid, heap->arena_id) && mi_segment_has_free(segment)) { - return mi_segment_page_alloc_in(segment, tld); - } - } - return NULL; -} - -static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_page_t* page = mi_segment_page_try_alloc_in_queue(heap, kind, tld); - if (page == NULL) { - // possibly allocate or reclaim a fresh segment - mi_segment_t* const segment = mi_segment_reclaim_or_alloc(heap, block_size, kind, page_shift, tld, os_tld); - if (segment == NULL) return NULL; // return NULL if out-of-memory (or reclaimed) - mi_assert_internal(segment->page_kind==kind); - mi_assert_internal(segment->used < segment->capacity); - mi_assert_internal(_mi_arena_memid_is_suitable(segment->memid, heap->arena_id)); - page = mi_segment_page_try_alloc_in_queue(heap, kind, tld); // this should now succeed - } - mi_assert_internal(page != NULL); - #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN - // verify it is committed - mi_segment_raw_page_start(_mi_page_segment(page), page, NULL)[0] = 0; - #endif - return page; -} - -static mi_page_t* mi_segment_small_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - return mi_segment_page_alloc(heap, block_size, MI_PAGE_SMALL,MI_SMALL_PAGE_SHIFT,tld,os_tld); -} - -static mi_page_t* mi_segment_medium_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - return mi_segment_page_alloc(heap, block_size, MI_PAGE_MEDIUM, MI_MEDIUM_PAGE_SHIFT, tld, os_tld); -} - -/* ----------------------------------------------------------- - large page allocation ------------------------------------------------------------ */ - -static mi_page_t* mi_segment_large_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_segment_t* segment = mi_segment_reclaim_or_alloc(heap,block_size,MI_PAGE_LARGE,MI_LARGE_PAGE_SHIFT,tld,os_tld); - if (segment == NULL) return NULL; - mi_page_t* page = mi_segment_find_free(segment, tld); - mi_assert_internal(page != NULL); -#if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN - mi_segment_raw_page_start(segment, page, NULL)[0] = 0; -#endif - return page; -} - -static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) -{ - mi_segment_t* segment = mi_segment_alloc(size, MI_PAGE_HUGE, MI_SEGMENT_SHIFT + 1, page_alignment, req_arena_id, tld, os_tld); - if (segment == NULL) return NULL; - mi_assert_internal(mi_segment_page_size(segment) - segment->segment_info_size - (2*(MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= size); - #if MI_HUGE_PAGE_ABANDON - segment->thread_id = 0; // huge pages are immediately abandoned - mi_segments_track_size(-(long)segment->segment_size, tld); - #endif - mi_page_t* page = mi_segment_find_free(segment, tld); - mi_assert_internal(page != NULL); - mi_assert_internal(page->is_huge); - - // for huge pages we initialize the block_size as we may - // overallocate to accommodate large alignments. - size_t psize; - uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); - page->block_size = psize; - - // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) - if (page_alignment > 0 && segment->allow_decommit && page->is_committed) { - uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); - mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment)); - mi_assert_internal(psize - (aligned_p - start) >= size); - uint8_t* decommit_start = start + sizeof(mi_block_t); // for the free list - ptrdiff_t decommit_size = aligned_p - decommit_start; - _mi_os_reset(decommit_start, decommit_size, os_tld->stats); // do not decommit as it may be in a region - } - - return page; -} - -#if MI_HUGE_PAGE_ABANDON -// free huge block from another thread -void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { - // huge page segments are always abandoned and can be freed immediately by any thread - mi_assert_internal(segment->page_kind==MI_PAGE_HUGE); - mi_assert_internal(segment == _mi_page_segment(page)); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0); - - // claim it and free - mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized. 
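
[editor's note] For an over-aligned huge page, the code above keeps the first block header committed and resets only the gap between it and the aligned payload. A standalone sketch of that pointer arithmetic, with a local `align_up` helper and illustrative addresses and sizes (the real code uses `_mi_align_up`, `sizeof(mi_block_t)` and `_mi_os_reset`):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uintptr_t align_up(uintptr_t p, size_t align) {
  return (p + align - 1) & ~(uintptr_t)(align - 1);   // align must be a power of two
}

int main(void) {
  // Illustrative numbers: a raw page that must deliver a 1 MiB aligned block.
  const uintptr_t start          = 0x10010000;           // raw page start
  const size_t    page_alignment = 0x100000;             // 1 MiB
  const size_t    block_header   = 16;                   // stands in for sizeof(mi_block_t)

  const uintptr_t aligned_p   = align_up(start, page_alignment);
  const uintptr_t reset_start = start + block_header;    // keep room for the free-list link
  const size_t    reset_size  = (size_t)(aligned_p - reset_start);

  assert(aligned_p % page_alignment == 0);
  assert(aligned_p == 0x10100000);
  assert(reset_size == 0x10100000 - 0x10010010);
  // [reset_start, reset_start + reset_size) is the range handed to the OS reset;
  // the aligned payload starting at aligned_p stays untouched.
  return 0;
}
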
- // paranoia: if this it the last reference, the cas should always succeed - size_t expected_tid = 0; - if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) { - mi_block_set_next(page, block, page->free); - page->free = block; - page->used--; - page->is_zero_init = false; - mi_assert(page->used == 0); - mi_tld_t* tld = heap->tld; - mi_segments_track_size((long)segment->segment_size, &tld->segments); - _mi_segment_page_free(page, true, &tld->segments); - } -#if (MI_DEBUG!=0) - else { - mi_assert_internal(false); - } -#endif -} - -#else -// reset memory of a huge block from another thread -void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { - mi_assert_internal(segment->page_kind == MI_PAGE_HUGE); - mi_assert_internal(segment == _mi_page_segment(page)); - mi_assert_internal(page->used == 1); // this is called just before the free - mi_assert_internal(page->free == NULL); - if (segment->allow_decommit && page->is_committed) { - size_t usize = mi_usable_size(block); - if (usize > sizeof(mi_block_t)) { - usize = usize - sizeof(mi_block_t); - uint8_t* p = (uint8_t*)block + sizeof(mi_block_t); - _mi_os_reset(p, usize, &_mi_stats_main); - } - } -} -#endif - -/* ----------------------------------------------------------- - Page allocation ------------------------------------------------------------ */ - -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_page_t* page; - if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { - mi_assert_internal(_mi_is_power_of_two(page_alignment)); - mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); - //mi_assert_internal((MI_SEGMENT_SIZE % page_alignment) == 0); - if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; } - page = mi_segment_huge_page_alloc(block_size, page_alignment, heap->arena_id, tld, os_tld); - } - else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { - page = mi_segment_small_page_alloc(heap, block_size, tld, os_tld); - } - else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) { - page = mi_segment_medium_page_alloc(heap, block_size, tld, os_tld); - } - else if (block_size <= MI_LARGE_OBJ_SIZE_MAX /* || mi_is_good_fit(block_size, MI_LARGE_PAGE_SIZE - sizeof(mi_segment_t)) */ ) { - page = mi_segment_large_page_alloc(heap, block_size, tld, os_tld); - } - else { - page = mi_segment_huge_page_alloc(block_size, page_alignment, heap->arena_id, tld, os_tld); - } - mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); - mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); - // mi_segment_try_purge(tld); - mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); - mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); - return page; -} - - -/* ----------------------------------------------------------- - Visit blocks in a segment (only used for abandoned segments) ------------------------------------------------------------ */ - -static bool mi_segment_visit_page(mi_page_t* page, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - mi_heap_area_t area; - _mi_heap_area_init(&area, page); - if (!visitor(NULL, &area, NULL, area.block_size, arg)) return false; - if (visit_blocks) { - return _mi_heap_area_visit_blocks(&area, page, visitor, arg); - } - else { - return true; - } -} - -bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* const page = &segment->pages[i]; - if (page->segment_in_use) { - if (heap_tag < 0 || (int)page->heap_tag == heap_tag) { - if (!mi_segment_visit_page(page, visit_blocks, visitor, arg)) return false; - } - } - } - return true; -} From 2084df3dde95aa2fb2bb73e8fc3eff2f7edc6662 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 12:20:54 -0800 Subject: [PATCH 049/264] add dedicated meta data allocation for threads and tld --- CMakeLists.txt | 1 + ide/vs2022/mimalloc-override.vcxproj | 1 + ide/vs2022/mimalloc-override.vcxproj.filters | 1 + ide/vs2022/mimalloc.vcxproj | 1 + ide/vs2022/mimalloc.vcxproj.filters | 1 + include/mimalloc/internal.h | 251 +++++++++---------- include/mimalloc/types.h | 32 ++- src/arena-meta.c | 156 ++++++++++++ src/arena.c | 90 +++---- src/heap.c | 50 ++-- src/init.c | 184 ++++++-------- src/os.c | 133 +++++----- src/page-map.c | 6 +- src/prim/windows/prim.c | 14 +- src/static.c | 1 + 15 files changed, 511 insertions(+), 411 deletions(-) create mode 100644 src/arena-meta.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e47cfe6..6df4ba5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,7 @@ set(mi_sources src/alloc-aligned.c src/alloc-posix.c src/arena.c + src/arena-meta.c src/bitmap.c src/heap.c src/init.c diff --git a/ide/vs2022/mimalloc-override.vcxproj b/ide/vs2022/mimalloc-override.vcxproj index a5d5c34c..eebc4d8a 100644 --- a/ide/vs2022/mimalloc-override.vcxproj +++ b/ide/vs2022/mimalloc-override.vcxproj @@ -236,6 +236,7 @@ + diff --git a/ide/vs2022/mimalloc-override.vcxproj.filters b/ide/vs2022/mimalloc-override.vcxproj.filters index 60c7a1fb..0e63822c 100644 --- a/ide/vs2022/mimalloc-override.vcxproj.filters +++ b/ide/vs2022/mimalloc-override.vcxproj.filters @@ -58,6 +58,7 @@ Sources + diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index e9a4a339..d8cc25b1 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -214,6 +214,7 @@ + false diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index a47efddd..7fc4ba9c 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,6 +58,7 @@ Sources + diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 28eca4bb..4c8256a0 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -27,8 +27,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning -#pragma warning(disable:28159) // don't use GetVersion -#pragma warning(disable:4996) // don't use GetVersion #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_align(a) __declspec(align(a)) @@ -58,42 +56,52 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_decl_externc #endif +// "libc.c" +#include +void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); +void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); +char _mi_toupper(char c); +int _mi_strnicmp(const char* s, const char* t, size_t n); +void _mi_strlcpy(char* dest, const char* src, size_t dest_size); +void _mi_strlcat(char* dest, const char* src, size_t dest_size); +size_t _mi_strlen(const char* s); +size_t _mi_strnlen(const char* s, size_t max_len); +bool _mi_getenv(const char* name, char* result, size_t result_size); // "options.c" -void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); -void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); -void _mi_warning_message(const char* fmt, ...); -void _mi_verbose_message(const char* fmt, ...); -void _mi_trace_message(const char* fmt, ...); -void _mi_output_message(const char* fmt, ...); -void _mi_options_init(void); -long _mi_option_get_fast(mi_option_t option); -void _mi_error_message(int err, const char* fmt, ...); +void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); +void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); +void _mi_warning_message(const char* fmt, ...); +void _mi_verbose_message(const char* fmt, ...); +void _mi_trace_message(const char* fmt, ...); +void _mi_output_message(const char* fmt, ...); +void _mi_options_init(void); +long _mi_option_get_fast(mi_option_t option); +void _mi_error_message(int err, const char* fmt, ...); // random.c -void _mi_random_init(mi_random_ctx_t* ctx); -void _mi_random_init_weak(mi_random_ctx_t* ctx); -void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); -void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); -uintptr_t _mi_random_next(mi_random_ctx_t* ctx); -uintptr_t _mi_heap_random_next(mi_heap_t* heap); -uintptr_t _mi_os_random_weak(uintptr_t extra_seed); +void _mi_random_init(mi_random_ctx_t* ctx); +void _mi_random_init_weak(mi_random_ctx_t* ctx); +void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); +void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); +uintptr_t _mi_random_next(mi_random_ctx_t* ctx); +uintptr_t _mi_heap_random_next(mi_heap_t* heap); +uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c extern mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_cache_align const mi_page_t _mi_page_empty; -void _mi_process_load(void); +void _mi_process_load(void); void mi_cdecl _mi_process_done(void); -bool _mi_is_redirected(void); -bool _mi_allocator_init(const char** message); -void _mi_allocator_done(void); -bool _mi_is_main_thread(void); -size_t _mi_current_thread_count(void); -bool _mi_preloading(void); // true while the C runtime is not initialized yet -void _mi_thread_done(mi_heap_t* heap); -void _mi_thread_data_collect(void); -void _mi_tld_init(mi_tld_t* tld, mi_heap_t* 
bheap); +bool _mi_is_redirected(void); +bool _mi_allocator_init(const char** message); +void _mi_allocator_done(void); +bool _mi_is_main_thread(void); +size_t _mi_current_thread_count(void); +bool _mi_preloading(void); // true while the C runtime is not initialized yet +void _mi_thread_done(mi_heap_t* heap); +mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; size_t _mi_thread_seq_id(void) mi_attr_noexcept; @@ -103,116 +111,94 @@ mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); void _mi_heap_guarded_init(mi_heap_t* heap); // os.c -void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); -void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); -void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); -void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); +void _mi_os_init(void); // called from process init +void* _mi_os_alloc(size_t size, mi_memid_t* memid); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid); +void _mi_os_free(void* p, size_t size, mi_memid_t memid); +void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); -size_t _mi_os_page_size(void); -size_t _mi_os_good_alloc_size(size_t size); -bool _mi_os_has_overcommit(void); -bool _mi_os_has_virtual_reserve(void); -size_t _mi_os_virtual_address_bits(void); +size_t _mi_os_page_size(void); +size_t _mi_os_good_alloc_size(size_t size); +bool _mi_os_has_overcommit(void); +bool _mi_os_has_virtual_reserve(void); +size_t _mi_os_virtual_address_bits(void); -bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats); -bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_protect(void* addr, size_t size); -bool _mi_os_unprotect(void* addr, size_t size); -bool _mi_os_purge(void* p, size_t size, mi_stats_t* stats); -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats); +bool _mi_os_reset(void* addr, size_t size); +bool _mi_os_commit(void* p, size_t size, bool* is_zero); +bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_protect(void* addr, size_t size); +bool _mi_os_unprotect(void* addr, size_t size); +bool _mi_os_purge(void* p, size_t size); +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats); -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats); +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); -void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); -bool _mi_os_use_large_page(size_t size, size_t alignment); -size_t _mi_os_large_page_size(void); +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); +bool _mi_os_use_large_page(size_t size, size_t alignment); +size_t _mi_os_large_page_size(void); -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); +void* 
_mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); // arena.c mi_arena_id_t _mi_arena_id_none(void); -void _mi_arena_init(void); -void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); -bool _mi_arena_contains(const void* p); -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); +void _mi_arena_init(void); +void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid); +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); +bool _mi_arena_contains(const void* p); +void _mi_arenas_collect(bool force_purge); +void _mi_arena_unsafe_destroy_all(void); -mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); -void _mi_arena_page_free(mi_page_t* page); -void _mi_arena_page_abandon(mi_page_t* page); -void _mi_arena_page_unabandon(mi_page_t* page); -bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); +void _mi_arena_page_free(mi_page_t* page); +void _mi_arena_page_abandon(mi_page_t* page); +void _mi_arena_page_unabandon(mi_page_t* page); +bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); -bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page); -void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap); - -void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); -void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); - -/* -typedef struct mi_arena_field_cursor_s { // abstract struct - size_t os_list_count; // max entries to visit in the OS abandoned list - size_t start; // start arena idx (may need to be wrapped) - size_t end; // end arena idx (exclusive, may need to be wrapped) - size_t bitmap_idx; // current bit idx for an arena - mi_subproc_t* subproc; // only visit blocks in this sub-process - bool visit_all; // ensure all abandoned blocks are seen (blocking) - bool hold_visit_lock; // if the subproc->abandoned_os_visit_lock is held -} mi_arena_field_cursor_t; -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current); -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); -void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current); -*/ +// arena-meta.c +void* _mi_meta_zalloc( size_t size, mi_memid_t* memid ); +void _mi_meta_free(void* p, size_t size, mi_memid_t memid); // "page-map.c" -bool _mi_page_map_init(void); -void _mi_page_map_register(mi_page_t* page); -void _mi_page_map_unregister(mi_page_t* page); - +bool _mi_page_map_init(void); +void _mi_page_map_register(mi_page_t* page); 
+void _mi_page_map_unregister(mi_page_t* page); // "page.c" -void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; +void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; -void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks -void _mi_page_unfull(mi_page_t* page); -void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq); // free the page -void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... -void _mi_page_force_abandon(mi_page_t* page); +void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks +void _mi_page_unfull(mi_page_t* page); +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq); // free the page +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... +void _mi_page_force_abandon(mi_page_t* page); +void _mi_heap_collect_retired(mi_heap_t* heap, bool force); -// void _mi_heap_delayed_free_all(mi_heap_t* heap); -// bool _mi_heap_delayed_free_partial(mi_heap_t* heap); -void _mi_heap_collect_retired(mi_heap_t* heap, bool force); +size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); +void _mi_deferred_free(mi_heap_t* heap, bool force); -size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); -void _mi_deferred_free(mi_heap_t* heap, bool force); +void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_init(mi_heap_t* heap, mi_page_t* page); -void _mi_page_free_collect(mi_page_t* page,bool force); -// void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments -void _mi_page_init(mi_heap_t* heap, mi_page_t* page); - -size_t _mi_bin_size(uint8_t bin); // for stats -uint8_t _mi_bin(size_t size); // for stats +size_t _mi_bin_size(uint8_t bin); // for stats +uint8_t _mi_bin(size_t size); // for stats // "heap.c" -void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); -void _mi_heap_destroy_pages(mi_heap_t* heap); -void _mi_heap_collect_abandon(mi_heap_t* heap); -void _mi_heap_set_default_direct(mi_heap_t* heap); -bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); -void _mi_heap_unsafe_destroy_all(void); -mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); -void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); -bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); -void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); +void _mi_heap_destroy_pages(mi_heap_t* heap); +void _mi_heap_collect_abandon(mi_heap_t* heap); +void _mi_heap_set_default_direct(mi_heap_t* heap); +bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); +void _mi_heap_unsafe_destroy_all(void); +mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); +void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); // "stats.c" -void _mi_stats_done(mi_stats_t* stats); +void 
_mi_stats_done(mi_stats_t* stats); mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); @@ -226,20 +212,6 @@ void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, siz void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); -// bool _mi_free_delayed_block(mi_block_t* block); - - -// "libc.c" -#include -void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); -void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); -char _mi_toupper(char c); -int _mi_strnicmp(const char* s, const char* t, size_t n); -void _mi_strlcpy(char* dest, const char* src, size_t dest_size); -void _mi_strlcat(char* dest, const char* src, size_t dest_size); -size_t _mi_strlen(const char* s); -size_t _mi_strnlen(const char* s, size_t max_len); -bool _mi_getenv(const char* name, char* result, size_t result_size); #if MI_DEBUG>1 bool _mi_page_is_valid(mi_page_t* page); @@ -449,9 +421,6 @@ static inline uintptr_t _mi_ptr_cookie(const void* p) { return ((uintptr_t)p ^ _mi_heap_main.cookie); } -static inline mi_tld_t* _mi_tld(void) { - return mi_heap_get_default()->tld; -} /* ----------------------------------------------------------- Pages @@ -908,6 +877,16 @@ static inline mi_memid_t _mi_memid_create_os(void* base, size_t size, bool commi return memid; } +static inline mi_memid_t _mi_memid_create_meta(void* mpage, size_t block_idx, size_t block_count) { + mi_memid_t memid = _mi_memid_create(MI_MEM_META); + memid.mem.meta.meta_page = mpage; + memid.mem.meta.block_index = (uint32_t)block_idx; + memid.mem.meta.block_count = (uint32_t)block_count; + memid.initially_committed = true; + memid.initially_zero = true; + memid.is_pinned = true; + return memid; +} // ------------------------------------------------------------------- // Fast "random" shuffle @@ -937,13 +916,13 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) { // Optimize numa node access for the common case (= one node) // ------------------------------------------------------------------- -int _mi_os_numa_node_get(mi_os_tld_t* tld); +int _mi_os_numa_node_get(void); size_t _mi_os_numa_node_count_get(void); extern _Atomic(size_t) _mi_numa_node_count; -static inline int _mi_os_numa_node(mi_os_tld_t* tld) { +static inline int _mi_os_numa_node(void) { if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } - else return _mi_os_numa_node_get(tld); + else return _mi_os_numa_node_get(); } static inline size_t _mi_os_numa_node_count(void) { const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index e10786a0..d0a77c5f 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -155,6 +155,7 @@ typedef enum mi_memkind_e { MI_MEM_NONE, // not allocated MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) + MI_MEM_META, // allocated with the meta data allocator MI_MEM_OS, // allocated from the OS MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. 
using `mremap`) @@ -165,6 +166,11 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) { return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); } +static inline bool mi_memkind_needs_no_free(mi_memkind_t memkind) { + return (memkind <= MI_MEM_STATIC); +} + + typedef struct mi_memid_os_info { void* base; // actual base address of the block (used for offset aligned allocations) size_t size; // allocated full size @@ -178,10 +184,17 @@ typedef struct mi_memid_arena_info { bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; +typedef struct mi_memid_meta_info { + void* meta_page; // meta-page that contains the block + uint32_t block_index; // block index in the meta-data page + uint32_t block_count; // allocated blocks +} mi_memid_meta_info_t; + typedef struct mi_memid_s { union { mi_memid_os_info_t os; // only used for MI_MEM_OS mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA + mi_memid_meta_info_t meta; // only used for MI_MEM_META } mem; bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) bool initially_committed;// `true` if the memory was originally allocated as committed @@ -190,6 +203,14 @@ typedef struct mi_memid_s { } mi_memid_t; +static inline bool mi_memid_is_os(mi_memid_t memid) { + return mi_memkind_is_os(memid.memkind); +} + +static inline bool mi_memid_needs_no_free(mi_memid_t memid) { + return mi_memkind_needs_no_free(memid.memkind); +} + // ------------------------------------------------------ // Mimalloc pages contain allocated blocks // ------------------------------------------------------ @@ -399,7 +420,8 @@ struct mi_heap_s { size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread - bool allow_page_reclaim; // `true` if this heap can reclaim abandoned pages + mi_memid_t memid; // provenance of the heap struct itseft (meta or os) + bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types #if MI_GUARDED @@ -560,12 +582,6 @@ struct mi_subproc_s { typedef int64_t mi_msecs_t; -// OS thread local data -typedef struct mi_os_tld_s { - size_t region_idx; // start point for next allocation - mi_stats_t* stats; // points to tld stats -} mi_os_tld_t; - // Thread local data struct mi_tld_s { unsigned long long heartbeat; // monotonic heartbeat count @@ -573,9 +589,9 @@ struct mi_tld_s { mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) mi_subproc_t* subproc; // sub-process this thread belongs to. size_t tseq; // thread sequence id + mi_memid_t memid; // provenance of the tld memory itself (meta or OS) bool recurse; // true if deferred was called; used to prevent infinite recursion. 
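
[editor's note] The new `MI_MEM_META` kind slots in between the static kinds (never individually freed) and the OS kinds, and `mi_memkind_needs_no_free` relies purely on that enum ordering; this is also why `mi_heap_t` and `mi_tld_t` now carry a `memid` recording the provenance of their own storage. A standalone sketch of the ordering test, using a shortened enum with illustrative names:

#include <assert.h>
#include <stdbool.h>

typedef enum kind_e {
  KIND_NONE,      // not allocated
  KIND_EXTERNAL,  // provided externally, not owned
  KIND_STATIC,    // static area, never freed
  KIND_META,      // meta-data allocator
  KIND_OS         // OS allocation
} kind_t;

// Same shape of test as `mi_memkind_needs_no_free`: everything up to and
// including the static kind needs no explicit free; meta and OS kinds do.
static bool kind_needs_no_free(kind_t k) {
  return (k <= KIND_STATIC);
}

int main(void) {
  assert(kind_needs_no_free(KIND_STATIC));
  assert(!kind_needs_no_free(KIND_META));
  assert(!kind_needs_no_free(KIND_OS));
  return 0;
}
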
bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) - mi_os_tld_t os; // os tld mi_stats_t stats; // statistics }; diff --git a/src/arena-meta.c b/src/arena-meta.c new file mode 100644 index 00000000..0fb4dfa5 --- /dev/null +++ b/src/arena-meta.c @@ -0,0 +1,156 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + We have a special "mini" allocator just for allocation of meta-data like + the heap (`mi_heap_t`) or thread-local data (`mi_tld_t`). + + We reuse the bitmap of the arena's for allocation of 64b blocks inside + an arena slice (64KiB). + We always ensure that meta data is zero'd (we zero on `free`) +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" + +/* ----------------------------------------------------------- + Meta data allocation +----------------------------------------------------------- */ + +#define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE +#define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN + +#define MI_META_BLOCK_SIZE (64) +#define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE +#define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 +#define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE) + +typedef struct mi_meta_page_s { + _Atomic(struct mi_meta_page_s*) next; // a linked list of meta-data pages (never released) + mi_memid_t memid; // provenance of the meta-page memory itself + mi_bitmap_t blocks_free; // a small bitmap with 1 bit per block. +} mi_meta_page_t; + +static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_VAR_INIT(NULL); + + +#if MI_DEBUG > 1 +static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) { + mi_meta_page_t* mpage = (mi_meta_page_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN); + if (block_idx != NULL) { + *block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE; + } + return mpage; +} +#endif + +static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) { + return mi_atomic_load_ptr_acquire(mi_meta_page_t, &mpage->next); +} + +static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { + mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); + mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE); + void* p = ((uint8_t*)mpage + (block_idx * MI_META_BLOCK_SIZE)); + mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL)); + return p; +} + +// allocate a fresh meta page and add it to the global list. 
+static mi_meta_page_t* mi_meta_page_zalloc(void) { + // allocate a fresh arena slice + mi_memid_t memid; + mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, + true /* commit*/, true /* allow large */, + _mi_arena_id_none(), 0 /* tseq */, &memid ); + if (mpage == NULL) return NULL; + mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); + if (!memid.initially_zero) { + _mi_memzero_aligned(mpage, MI_ARENA_SLICE_SIZE); + } + + // initialize the page + mpage->memid = memid; + mi_bitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); + const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); + const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); + mi_assert_internal(info_blocks < MI_META_BLOCKS_PER_PAGE); + mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); + + // push atomically in front of the meta page list + // (note: there is no ABA issue since we never free meta-pages) + mi_meta_page_t* old = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages); + do { + mi_atomic_store_ptr_release(mi_meta_page_t, &mpage->next, old); + } while(!mi_atomic_cas_ptr_weak_acq_rel(mi_meta_page_t,&mi_meta_pages,&old,mpage)); + return mpage; +} + + +// allocate meta-data +void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) +{ + mi_assert_internal(pmemid != NULL); + size = _mi_align_up(size,MI_META_BLOCK_SIZE); + if (size == 0 || size > MI_META_MAX_SIZE) return NULL; + const size_t block_count = _mi_divide_up(size,MI_META_BLOCK_SIZE); + mi_assert_internal(block_count > 0 && block_count < MI_BCHUNK_BITS); + mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages); + mi_meta_page_t* mpage = mpage0; + while (mpage != NULL) { + size_t block_idx; + if (mi_bitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + // found and claimed `block_count` blocks + *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); + return mi_meta_block_start(mpage,block_idx); + } + else { + mpage = mi_meta_page_next(mpage); + } + } + // failed to find space in existing pages + if (mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages) != mpage0) { + // the page list was updated by another thread in the meantime, retry + return _mi_meta_zalloc(size,pmemid); + } + // otherwise, allocate a fresh metapage and try once more + mpage = mi_meta_page_zalloc(); + if (mpage != NULL) { + size_t block_idx; + if (mi_bitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + // found and claimed `block_count` blocks + *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); + return mi_meta_block_start(mpage,block_idx); + } + } + // if all this failed, allocate from the OS + return _mi_os_alloc(size, pmemid); +} + +// free meta-data +void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { + if (p==NULL) return; + if (memid.memkind == MI_MEM_META) { + mi_assert_internal(_mi_divide_up(size, MI_META_BLOCK_SIZE) == memid.mem.meta.block_count); + const size_t block_count = memid.mem.meta.block_count; + const size_t block_idx = memid.mem.meta.block_index; + mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page; + mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage); + mi_assert_internal(block_idx + block_count < MI_META_BLOCKS_PER_PAGE); + mi_assert_internal(mi_bitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); + 
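
[editor's note] The allocation path in `_mi_meta_zalloc` above is a first-fit scan over a per-page bitmap of 64-byte blocks: find a run of free bits, claim it, otherwise move to the next meta page (or allocate a fresh one, or fall back to the OS). A miniature, self-contained model of that scan over a single 64-bit word, with a set bit meaning "block is free" (the real bitmap covers 1024 blocks and uses `mi_bitmap_try_find_and_clearN`):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Try to claim `n` consecutive free blocks; clears their bits on success.
static bool claim_blocks(uint64_t* bitmap, size_t n, size_t* block_idx) {
  assert(n >= 1 && n <= 64);
  const uint64_t mask = (n == 64 ? ~UINT64_C(0) : ((UINT64_C(1) << n) - 1));
  for (size_t i = 0; i + n <= 64; i++) {
    if (((*bitmap >> i) & mask) == mask) {   // n free blocks starting at i?
      *bitmap &= ~(mask << i);               // claim them
      *block_idx = i;
      return true;
    }
  }
  return false;  // caller would allocate a fresh meta page or fall back to the OS
}

int main(void) {
  uint64_t bitmap = ~UINT64_C(0);   // all 64 blocks free
  size_t idx;
  assert(claim_blocks(&bitmap, 3, &idx) && idx == 0);   // first fit at block 0
  assert(claim_blocks(&bitmap, 2, &idx) && idx == 3);   // next fit right after
  bitmap = 0;                                           // page exhausted
  assert(!claim_blocks(&bitmap, 1, &idx));
  return 0;
}
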
// we zero on free (and on the initial page allocation) so we don't need a "dirty" map + _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); + mi_bitmap_clearN(&mpage->blocks_free, block_idx, block_count); + } + else if (mi_memid_is_os(memid)) { + _mi_os_free(p, size, memid); + } + else { + mi_assert_internal(mi_memid_needs_no_free(memid)); + } +} diff --git a/src/arena.c b/src/arena.c index fa7d53ed..2558165a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -214,7 +214,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // not fully committed: commit the full range and set the commit bits // (this may race and we may double-commit which is fine) bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { + if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) { memid->initially_committed = false; } else { @@ -364,14 +364,13 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are static mi_decl_noinline void* mi_arena_try_find_free( size_t slice_count, size_t alignment, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) + mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); if (alignment > MI_ARENA_SLICE_ALIGN) return NULL; // search arena's - const size_t tseq = tld->tseq; mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) { void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); @@ -385,14 +384,14 @@ static mi_decl_noinline void* mi_arena_try_find_free( static mi_decl_noinline void* mi_arena_try_alloc( size_t slice_count, size_t alignment, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) + mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; again: // try to find free slices in the arena's - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); + p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; // did we need a specific arena? 
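
[editor's note] A companion to the claim sketch above for the free path: under the same set-bit-means-free convention used by the allocation scan, returning blocks means zeroing their memory first (zero-on-free avoids a separate "dirty" map, as the comment in the hunk notes) and then marking their bits as free again. This is an illustrative standalone model under that assumed convention, not the patch's `_mi_meta_free` itself:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Free `n` blocks starting at `block_idx`: zero their contents, then hand
// the bits back to the free map.
static void release_blocks(uint64_t* bitmap, uint8_t* page, size_t block_idx, size_t n) {
  const size_t block_size = 64;                               // MI_META_BLOCK_SIZE in the patch
  memset(page + block_idx * block_size, 0, n * block_size);   // zero on free
  const uint64_t mask = (n == 64 ? ~UINT64_C(0) : ((UINT64_C(1) << n) - 1));
  assert((*bitmap & (mask << block_idx)) == 0);               // blocks must currently be claimed
  *bitmap |= (mask << block_idx);                             // mark them free again
}

int main(void) {
  uint8_t page[64 * 64];                 // one toy "meta page" of 64 blocks of 64 bytes
  uint64_t bitmap = ~UINT64_C(0) << 3;   // blocks 0..2 are currently claimed
  memset(page, 0xAA, 3 * 64);            // pretend they hold meta data
  release_blocks(&bitmap, page, 0, 3);
  assert(bitmap == ~UINT64_C(0));        // all blocks free again
  assert(page[0] == 0 && page[191] == 0);// and their contents are zeroed
  return 0;
}
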
@@ -406,7 +405,7 @@ again: if (ok) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); + p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; } } @@ -423,7 +422,7 @@ again: static void* mi_arena_os_alloc_aligned( size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) + mi_arena_id_t req_arena_id, mi_memid_t* memid) { // if we cannot use OS allocation, return NULL if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { @@ -432,10 +431,10 @@ static void* mi_arena_os_alloc_aligned( } if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, &tld->stats); + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid); } else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, &tld->stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); } } @@ -444,9 +443,9 @@ static void* mi_arena_os_alloc_aligned( void* _mi_arena_alloc_aligned( size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) + mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { - mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(memid != NULL); mi_assert_internal(size > 0); // *memid = _mi_memid_none(); @@ -459,18 +458,18 @@ void* _mi_arena_alloc_aligned( alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { const size_t slice_count = mi_slice_count_of_size(size); - void* p = mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); + void* p = mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; } // fall back to the OS - void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); + void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid); return p; } -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { - return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); + return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, tseq, memid); } @@ -566,7 +565,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { - page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, tld->tseq, &memid); } // otherwise fall back to the OS @@ -574,10 +573,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz if (os_align) { // note: slice_count already includes the page mi_assert_internal(slice_count >= 
mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid); } else { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid); } } @@ -725,7 +724,7 @@ void _mi_arena_page_free(mi_page_t* page) { #endif _mi_page_map_unregister(page); - _mi_arena_free(page, 1, 1, page->memid, NULL); + _mi_arena_free(page, 1, 1, page->memid); } /* ----------------------------------------------------------- @@ -831,16 +830,15 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { /* ----------------------------------------------------------- Arena free ----------------------------------------------------------- */ -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats); -static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); +static void mi_arenas_try_purge(bool force, bool visit_all); -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid) { mi_assert_internal(size > 0); mi_assert_internal(committed_size <= size); if (p==NULL) return; if (size==0) return; const bool all_committed = (committed_size == size); - if (stats==NULL) { stats = &_mi_stats_main; } // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
mi_track_mem_undefined(p, size); @@ -851,7 +849,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) _mi_stat_decrease(&_mi_stats_main.committed, committed_size); } - _mi_os_free(p, size, memid, stats); + _mi_os_free(p, size, memid); } else if (memid.memkind == MI_MEM_ARENA) { // allocated in an arena @@ -894,7 +892,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } */ // (delay) purge the entire range - mi_arena_schedule_purge(arena, slice_index, slice_count, stats); + mi_arena_schedule_purge(arena, slice_index, slice_count); } // and make it available to others again @@ -904,13 +902,16 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi return; }; } + else if (memid.memkind == MI_MEM_META) { + _mi_meta_free(p, size, memid); + } else { // arena was none, external, or static; nothing to do - mi_assert_internal(memid.memkind < MI_MEM_OS); + mi_assert_internal(mi_memid_needs_no_free(memid)); } // purge expired decommits - mi_arenas_try_purge(false, false, stats); + mi_arenas_try_purge(false, false); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` @@ -924,7 +925,7 @@ static void mi_arenas_unsafe_destroy(void) { mi_lock_done(&arena->abandoned_visit_lock); if (mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main); + _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); } } } @@ -935,15 +936,15 @@ static void mi_arenas_unsafe_destroy(void) { } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +void _mi_arenas_collect(bool force_purge) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { +void _mi_arena_unsafe_destroy_all(void) { mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas + _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } // Is a pointer inside any of our arenas? 
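For orientation, a hedged sketch of how a caller is expected to use the updated `_mi_arena_free` above (the wrapper name is illustrative): the dispatch is driven entirely by the `mi_memid_t`, so no `mi_stats_t*` needs to be threaded through anymore.

static void my_release_example(void* p, size_t size, mi_memid_t memid) {
  // `_mi_arena_free` routes the memory back based on `memid.memkind`:
  // OS memory goes to `_mi_os_free`, arena slices are marked free again
  // (and scheduled for a delayed purge), meta-data blocks go to
  // `_mi_meta_free`, and static/external memory needs no free at all.
  _mi_arena_free(p, size, size /* fully committed */, memid);
}
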
@@ -1036,7 +1037,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // commit & zero if needed bool is_zero = memid.initially_zero; if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_slices(info_slices), NULL, &_mi_stats_main); + _mi_os_commit(arena, mi_size_of_slices(info_slices), NULL); } if (!is_zero) { _mi_memzero(arena, mi_size_of_slices(info_slices)); @@ -1096,11 +1097,11 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc if (arena_id != NULL) *arena_id = _mi_arena_id_none(); size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one slice mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; } @@ -1219,7 +1220,7 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); + _mi_os_free(p, hsize, memid); return ENOMEM; } return 0; @@ -1281,14 +1282,14 @@ static long mi_arena_purge_delay(void) { // reset or decommit in an arena and update the committed/decommit bitmaps // assumes we own the area (i.e. slices_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats) { +static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices) { mi_assert_internal(!arena->memid.is_pinned); const size_t size = mi_size_of_slices(slices); void* const p = mi_arena_slice_start(arena, slice_index); bool needs_recommit; if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slices)) { // all slices are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); + needs_recommit = _mi_os_purge(p, size); } else { // some slices are not committed -- this can happen when a partially committed slice is freed @@ -1296,7 +1297,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), // and also undo the decommit stats (as it was already adjusted) mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */); if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } } @@ -1312,13 +1313,13 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, // Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. 
// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats) { +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices) { const long delay = mi_arena_purge_delay(); if (delay < 0) return; // is purging allowed at all? if (_mi_preloading() || delay == 0) { // decommit directly - mi_arena_purge(arena, slice_index, slices, stats); + mi_arena_purge(arena, slice_index, slices); } else { // schedule decommit @@ -1327,14 +1328,13 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ } -static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { +static void mi_arenas_try_purge(bool force, bool visit_all) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); if (max_arena == 0) return; // _mi_error_message(EFAULT, "purging not yet implemented\n"); - MI_UNUSED(stats); MI_UNUSED(visit_all); MI_UNUSED(force); } @@ -1385,7 +1385,7 @@ void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { if (p != NULL) return p; // or fall back to the OS - p = _mi_os_alloc(size, memid, &_mi_stats_main); + p = _mi_os_alloc(size, memid); if (p == NULL) return NULL; // zero the OS memory if needed @@ -1398,7 +1398,7 @@ void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid, &_mi_stats_main); + _mi_os_free(p, size, memid); } else { mi_assert(memid.memkind == MI_MEM_STATIC); diff --git a/src/heap.c b/src/heap.c index 3bf8b976..d2914361 100644 --- a/src/heap.c +++ b/src/heap.c @@ -119,36 +119,31 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) _mi_deferred_free(heap, force); // python/cpython#112532: we may be called from a thread that is not the owner of the heap - const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); + // const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); // note: never reclaim on collect but leave it to threads that need storage to reclaim - if ( - #ifdef NDEBUG - collect == MI_FORCE - #else - collect >= MI_FORCE - #endif - && is_main_thread && mi_heap_is_backing(heap) && heap->allow_page_reclaim) - { - // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. - // if all memory is freed by now, all segments should be freed. - // note: this only collects in the current subprocess - _mi_arena_reclaim_all_abandoned(heap); - } + //if ( + //#ifdef NDEBUG + // collect == MI_FORCE + //#else + // collect >= MI_FORCE + //#endif + // && is_main_thread && mi_heap_is_backing(heap) && heap->allow_page_reclaim) + //{ + // // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. + // // if all memory is freed by now, all segments should be freed. 
+ // // note: this only collects in the current subprocess + // _mi_arena_reclaim_all_abandoned(heap); + //} // collect retired pages _mi_heap_collect_retired(heap, force); // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - - // if forced, collect thread data cache on program-exit (or shared library unload) - if (force && is_main_thread && mi_heap_is_backing(heap)) { - _mi_thread_data_collect(); // collect thread data cache - } - + // collect arenas (this is program wide so don't force purges on abandonment of threads) - _mi_arenas_collect(collect == MI_FORCE /* force purge? */, &heap->tld->stats); + _mi_arenas_collect(collect == MI_FORCE /* force purge? */); } void _mi_heap_collect_abandon(mi_heap_t* heap) { @@ -187,24 +182,25 @@ mi_heap_t* mi_heap_get_backing(void) { return bheap; } -void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = tld; + heap->tld = _mi_tld(); heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->tag = tag; - if (tld->is_in_threadpool) { + if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. // (but abandoning is good in this case) heap->allow_page_reclaim = false; } - if (heap == tld->heap_backing) { + if (heap->tld->heap_backing == NULL) { + heap->tld->heap_backing = heap; // first heap becomes the backing heap _mi_random_init(&heap->random); } else { - _mi_random_split(&tld->heap_backing->random, &heap->random); + _mi_random_split(&heap->tld->heap_backing->random, &heap->random); } heap->cookie = _mi_heap_random_next(heap) | 1; heap->keys[0] = _mi_heap_random_next(heap); @@ -220,7 +216,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? if (heap == NULL) return NULL; mi_assert(heap_tag >= 0 && heap_tag < 256); - _mi_heap_init(heap, bheap->tld, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */); + _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? 
*/, (uint8_t)heap_tag /* heap tag */); return heap; } diff --git a/src/init.c b/src/init.c index b66efc69..f09821b4 100644 --- a/src/init.c +++ b/src/init.c @@ -96,6 +96,8 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- +#define MI_MEMID_STATIC {{{0}},true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } + mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free @@ -107,6 +109,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next + MI_MEMID_STATIC, // memid false, // can reclaim true, // can eager abandon 0, // tag @@ -135,9 +138,9 @@ static mi_decl_cache_align mi_tld_t tld_main = { &_mi_heap_main, &_mi_heap_main, &mi_subproc_default, // subproc 0, // tseq + MI_MEMID_STATIC, // memid false, // recurse false, // is_in_threadpool - { 0, &tld_main.stats }, // os { MI_STATS_NULL } // stats }; @@ -152,6 +155,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next heap + MI_MEMID_STATIC, // memid true, // allow page reclaim true, // allow page abandon 0, // tag @@ -230,6 +234,47 @@ mi_heap_t* _mi_heap_main_get(void) { } +/* ----------------------------------------------------------- + Thread local data +----------------------------------------------------------- */ + +// Thread sequence number +static _Atomic(size_t) mi_tcount; + +// The mimalloc thread local data +mi_decl_thread mi_tld_t* mi_tld; + +// Allocate fresh tld +static mi_tld_t* mi_tld_alloc(void) { + if (_mi_is_main_thread()) { + return &tld_main; + } + else { + mi_memid_t memid; + mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid); + if (tld==NULL) { + _mi_error_message(ENOMEM, "unable to allocate memory for thread local data\n"); + return NULL; + } + tld->memid = memid; + tld->heap_backing = NULL; + tld->heaps = NULL; + tld->subproc = &mi_subproc_default; + tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); + return tld; + } +} + +mi_tld_t* _mi_tld(void) { + if (mi_tld==NULL) { + mi_tld = mi_tld_alloc(); + } + return mi_tld; +} + + + /* ----------------------------------------------------------- Sub process ----------------------------------------------------------- */ @@ -239,11 +284,11 @@ mi_subproc_id_t mi_subproc_main(void) { } mi_subproc_id_t mi_subproc_new(void) { - mi_memid_t memid = _mi_memid_none(); - mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); + mi_memid_t memid; + mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid); if (subproc == NULL) return NULL; - subproc->memid = memid; subproc->abandoned_os_list = NULL; + subproc->memid = memid; mi_lock_init(&subproc->abandoned_os_lock); mi_lock_init(&subproc->abandoned_os_visit_lock); return subproc; @@ -269,7 +314,7 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { // todo: should we refcount subprocesses? 
mi_lock_done(&subproc->abandoned_os_lock); mi_lock_done(&subproc->abandoned_os_visit_lock); - _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); + _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { @@ -281,94 +326,10 @@ void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { } - /* ----------------------------------------------------------- - Initialization and freeing of the thread local heaps + Allocate heap data ----------------------------------------------------------- */ -// note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size). -typedef struct mi_thread_data_s { - mi_heap_t heap; // must come first due to cast in `_mi_heap_done` - mi_tld_t tld; - mi_memid_t memid; // must come last due to zero'ing -} mi_thread_data_t; - - -// Thread meta-data is allocated directly from the OS. For -// some programs that do not use thread pools and allocate and -// destroy many OS threads, this may causes too much overhead -// per thread so we maintain a small cache of recently freed metadata. - -#define TD_CACHE_SIZE (32) -static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE]; - -static mi_thread_data_t* mi_thread_data_zalloc(void) { - // try to find thread metadata in the cache - bool is_zero = false; - mi_thread_data_t* td = NULL; - for (int i = 0; i < TD_CACHE_SIZE; i++) { - td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); - if (td != NULL) { - // found cached allocation, try use it - td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); - if (td != NULL) { - break; - } - } - } - - // if that fails, allocate as meta data - if (td == NULL) { - mi_memid_t memid; - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); - if (td == NULL) { - // if this fails, try once more. 
(issue #257) - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); - if (td == NULL) { - // really out of memory - _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); - } - } - if (td != NULL) { - td->memid = memid; - is_zero = memid.initially_zero; - } - } - - if (td != NULL && !is_zero) { - _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid)); - } - return td; -} - -static void mi_thread_data_free( mi_thread_data_t* tdfree ) { - // try to add the thread metadata to the cache - for (int i = 0; i < TD_CACHE_SIZE; i++) { - mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); - if (td == NULL) { - mi_thread_data_t* expected = NULL; - if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) { - return; - } - } - } - // if that fails, just free it directly - _mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid, &_mi_stats_main); -} - -void _mi_thread_data_collect(void) { - // free all thread metadata from the cache - for (int i = 0; i < TD_CACHE_SIZE; i++) { - mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); - if (td != NULL) { - td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); - if (td != NULL) { - _mi_os_free(td, sizeof(mi_thread_data_t), td->memid, &_mi_stats_main); - } - } - } -} - // Initialize the thread local default heap, called from `mi_thread_init` static bool _mi_thread_heap_init(void) { if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; @@ -380,32 +341,21 @@ static bool _mi_thread_heap_init(void) { //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { - // use `_mi_os_alloc` to allocate directly from the OS - mi_thread_data_t* td = mi_thread_data_zalloc(); - if (td == NULL) return false; - - mi_tld_t* tld = &td->tld; - mi_heap_t* heap = &td->heap; - _mi_tld_init(tld, heap); // must be before `_mi_heap_init` - _mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); + // allocate heap and thread local data + mi_tld_t* tld = _mi_tld(); // allocates & initializes tld if needed + mi_memid_t memid; + mi_heap_t* heap = (tld == NULL ? 
NULL : (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid)); + if (heap==NULL || tld==NULL) { + _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); + return false; + } + heap->memid = memid; + _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); _mi_heap_set_default_direct(heap); } return false; } -// Thread sequence number -static _Atomic(size_t) mi_tcount; - -// initialize thread local data -void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { - _mi_memzero_aligned(tld,sizeof(mi_tld_t)); - tld->heap_backing = bheap; - tld->heaps = NULL; - tld->subproc = &mi_subproc_default; - tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); - tld->os.stats = &tld->stats; - tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); -} // Free the thread local default heap (called from `mi_thread_done`) static bool _mi_thread_heap_done(mi_heap_t* heap) { @@ -441,7 +391,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // free if not the main thread if (heap != &_mi_heap_main) { - mi_thread_data_free((mi_thread_data_t*)heap); + _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); } else { #if 0 @@ -533,7 +483,13 @@ void _mi_thread_done(mi_heap_t* heap) if (heap->thread_id != _mi_thread_id()) return; // abandon the thread local heap - if (_mi_thread_heap_done(heap)) return; // returns true if already ran + _mi_thread_heap_done(heap); // returns true if already ran + + // free thread local data + if (mi_tld != NULL) { + _mi_meta_free(mi_tld, sizeof(mi_tld_t), mi_tld->memid); + mi_tld = NULL; + } } void _mi_heap_set_default_direct(mi_heap_t* heap) { @@ -689,7 +645,7 @@ void mi_cdecl _mi_process_done(void) { if (mi_option_is_enabled(mi_option_destroy_on_exit)) { mi_collect(true /* force */); _mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) - _mi_arena_unsafe_destroy_all(& _mi_heap_main_get()->tld->stats); + _mi_arena_unsafe_destroy_all(); } if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { diff --git a/src/os.c b/src/os.c index 0c020302..b913fb1c 100644 --- a/src/os.c +++ b/src/os.c @@ -9,6 +9,8 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/atomic.h" #include "mimalloc/prim.h" +// always use main stats for OS calls +#define os_stats (&_mi_stats_main) /* ----------------------------------------------------------- Initialization. 
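Condensing the `init.c` changes above into one hedged sketch (error handling and page abandonment omitted; function names as in the patch): both the thread-local `mi_tld_t` and the default heap are now carved out of meta-data pages and given back on thread exit.

static void my_thread_lifecycle_sketch(void) {
  // on the first allocation in a thread (see `_mi_thread_heap_init`):
  mi_tld_t* tld = _mi_tld();            // lazily meta-allocates the tld
  mi_memid_t heap_memid;
  mi_heap_t* heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &heap_memid);
  heap->memid = heap_memid;
  _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */);
  _mi_heap_set_default_direct(heap);

  // ... the thread allocates and frees ...

  // on thread exit (see `_mi_thread_done`):
  _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid);
  _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid);
}
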
@@ -89,8 +91,8 @@ void _mi_os_init(void) { /* ----------------------------------------------------------- Util -------------------------------------------------------------- */ -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); +bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_commit(void* addr, size_t size, bool* is_zero); void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { MI_UNUSED(try_alignment); MI_UNUSED(size); @@ -102,11 +104,9 @@ void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { Free memory -------------------------------------------------------------- */ -static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats); +static void mi_os_free_huge_os_pages(void* p, size_t size); -static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +static void mi_os_prim_free(void* addr, size_t size, bool still_committed) { mi_assert_internal((size % _mi_os_page_size()) == 0); if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) int err = _mi_prim_free(addr, size); @@ -114,13 +114,12 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_st _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } if (still_committed) { - _mi_stat_decrease(&stats->committed, size); + _mi_stat_decrease(&os_stats->committed, size); } - _mi_stat_decrease(&stats->reserved, size); + _mi_stat_decrease(&os_stats->reserved, size); } -void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats) { - if (stats == NULL) stats = &_mi_stats_main; +void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) { if (mi_memkind_is_os(memid.memkind)) { size_t csize = memid.mem.os.size; if (csize==0) { _mi_os_good_alloc_size(size); } @@ -135,10 +134,10 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me // free it if (memid.memkind == MI_MEM_OS_HUGE) { mi_assert(memid.is_pinned); - mi_os_free_huge_os_pages(base, csize, stats); + mi_os_free_huge_os_pages(base, csize); } else { - mi_os_prim_free(base, csize, still_committed, stats); + mi_os_prim_free(base, csize, still_committed); } } else { @@ -147,9 +146,8 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me } } -void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats) { - if (stats == NULL) stats = &_mi_stats_main; - _mi_os_free_ex(p, size, true, memid, stats); +void _mi_os_free(void* p, size_t size, mi_memid_t memid) { + _mi_os_free_ex(p, size, true, memid); } @@ -159,7 +157,7 @@ void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats) { // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. // Also `hint_addr` is a hint and may be ignored. 
-static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) { +static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_zero != NULL); mi_assert_internal(is_large != NULL); @@ -173,13 +171,11 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large); } - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - _mi_stat_counter_increase(&stats->mmap_calls, 1); + _mi_stat_counter_increase(&os_stats->mmap_calls, 1); if (p != NULL) { - _mi_stat_increase(&stats->reserved, size); + _mi_stat_increase(&os_stats->reserved, size); if (commit) { - _mi_stat_increase(&stats->committed, size); + _mi_stat_increase(&os_stats->committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } @@ -190,14 +186,14 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm return p; } -static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) { - return mi_os_prim_alloc_at(NULL, size, try_alignment, commit, allow_large, is_large, is_zero, tld_stats); +static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { + return mi_os_prim_alloc_at(NULL, size, try_alignment, commit, allow_large, is_large, is_zero); } // Primitive aligned allocation from the OS. // This function guarantees the allocated memory is aligned. -static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base, mi_stats_t* stats) { +static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base) { mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0)); mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_large != NULL); @@ -213,7 +209,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) void* p = NULL; if (try_direct_alloc) { - p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); + p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero); } // aligned already? 
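The assertions and the "aligned already?" check above rely on the usual power-of-two bit tricks; a tiny standalone illustration (not from the patch):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// a power of two has exactly one bit set, so `x & (x-1)` clears it to zero
static bool my_is_power_of_two(size_t alignment) {
  return (alignment != 0 && (alignment & (alignment - 1)) == 0);
}

// for a power-of-two alignment, an aligned address has all low bits zero
static bool my_is_aligned(const void* p, size_t alignment) {
  return (((uintptr_t)p & (alignment - 1)) == 0);
}
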
@@ -227,13 +223,13 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); } #endif - if (p != NULL) { mi_os_prim_free(p, size, commit, stats); } + if (p != NULL) { mi_os_prim_free(p, size, commit); } if (size >= (SIZE_MAX - alignment)) return NULL; // overflow const size_t over_size = size + alignment; if (!mi_os_mem_config.has_partial_free) { // win32 virtualAlloc cannot free parts of an allocated block // over-allocate uncommitted (virtual) memory - p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats); + p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero); if (p == NULL) return NULL; // set p to the aligned part in the full region @@ -244,12 +240,12 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // explicitly commit only the aligned part if (commit) { - _mi_os_commit(p, size, NULL, stats); + _mi_os_commit(p, size, NULL); } } else { // mmap can free inside an allocation // overallocate... - p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats); + p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero); if (p == NULL) return NULL; // and selectively unmap parts around the over-allocated area. @@ -258,8 +254,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit size_t mid_size = _mi_align_up(size, _mi_os_page_size()); size_t post_size = over_size - pre_size - mid_size; mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size); - if (pre_size > 0) { mi_os_prim_free(p, pre_size, commit, stats); } - if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); } + if (pre_size > 0) { mi_os_prim_free(p, pre_size, commit); } + if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit); } // we can return the aligned pointer on `mmap` systems p = aligned_p; *base = aligned_p; // since we freed the pre part, `*base == p`. 
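A small worked example of the over-allocation fallback above (the sizes and the address are made up): asking for 1 MiB aligned to 4 MiB on an mmap-style system over-allocates 5 MiB, aligns the pointer up inside that region, and returns the slack before and after the aligned block to the OS.

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const size_t KiB = 1024, MiB = 1024*KiB;
  const size_t size = 1*MiB, alignment = 4*MiB, page_size = 4*KiB;
  const size_t over_size = size + alignment;                           // 5 MiB over-allocation
  const uintptr_t p = 0x10001000;                                      // example (page-aligned) mmap result
  const uintptr_t aligned_p = (p + alignment - 1) & ~(uintptr_t)(alignment - 1); // align up inside the region
  const size_t pre_size  = (size_t)(aligned_p - p);                    // freed before the aligned part
  const size_t mid_size  = (size + page_size - 1) & ~(page_size - 1);  // kept (size rounded up to a page)
  const size_t post_size = over_size - pre_size - mid_size;            // freed after the aligned part
  printf("pre=%zu KiB, mid=%zu KiB, post=%zu KiB\n", pre_size/KiB, mid_size/KiB, post_size/KiB);
  return 0;  // prints: pre=4092 KiB, mid=1024 KiB, post=4 KiB
}
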
@@ -275,33 +271,31 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit OS API: alloc and alloc_aligned ----------------------------------------------------------- */ -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_os_alloc(size_t size, mi_memid_t* memid) { *memid = _mi_memid_none(); if (size == 0) return NULL; - if (stats == NULL) stats = &_mi_stats_main; size = _mi_os_good_alloc_size(size); bool os_is_large = false; bool os_is_zero = false; - void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); + void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero); if (p != NULL) { *memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large); } return p; } -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid) { MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings *memid = _mi_memid_none(); if (size == 0) return NULL; - if (stats == NULL) stats = &_mi_stats_main; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; - void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, stats ); + void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base); if (p != NULL) { *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; @@ -311,9 +305,8 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo return p; } -void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { - MI_UNUSED(stats); - void* p = _mi_os_alloc(size, memid, &_mi_stats_main); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid) { + void* p = _mi_os_alloc(size, memid); if (p == NULL) return NULL; // zero the OS memory if needed @@ -332,27 +325,26 @@ void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { to use the actual start of the memory region. 
----------------------------------------------------------- */ -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) { mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); *memid = _mi_memid_none(); - if (stats == NULL) stats = &_mi_stats_main; if (offset == 0) { // regular aligned allocation - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); } else { // overallocate to align at an offset const size_t extra = _mi_align_up(offset, alignment) - offset; const size_t oversize = size + extra; - void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, stats); + void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid); if (start == NULL) return NULL; void* const p = (uint8_t*)start + extra; mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); // decommit the overallocation at the start if (commit && extra > _mi_os_page_size()) { - _mi_os_decommit(start, extra, stats); + _mi_os_decommit(start, extra); } return p; } @@ -386,12 +378,10 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* return mi_os_page_align_areax(true, addr, size, newsize); } -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { if (is_zero != NULL) { *is_zero = false; } - _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit - _mi_stat_counter_increase(&stats->commit_calls, 1); + _mi_stat_increase(&os_stats->committed, size); // use size for precise commit vs. decommit + _mi_stat_counter_increase(&os_stats->commit_calls, 1); // page align range size_t csize; @@ -417,11 +407,9 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats return true; } -static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) { mi_assert_internal(needs_recommit!=NULL); - _mi_stat_decrease(&stats->committed, size); + _mi_stat_decrease(&os_stats->committed, size); // page align size_t csize; @@ -438,9 +426,9 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_ return (err == 0); } -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { +bool _mi_os_decommit(void* addr, size_t size) { bool needs_recommit; - return mi_os_decommit_ex(addr, size, &needs_recommit, tld_stats); + return mi_os_decommit_ex(addr, size, &needs_recommit); } @@ -448,13 +436,13 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { // but may be used later again. This will release physical memory // pages and reduce swapping while keeping the memory committed. // We page align to a conservative area inside the range to reset. 
-bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { +bool _mi_os_reset(void* addr, size_t size) { // page align conservatively within the range size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) - _mi_stat_increase(&stats->reset, csize); - _mi_stat_counter_increase(&stats->reset_calls, 1); + _mi_stat_increase(&os_stats->reset, csize); + _mi_stat_counter_increase(&os_stats->reset_calls, 1); #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN memset(start, 0, csize); // pretend it is eagerly reset @@ -470,22 +458,22 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) { if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? - _mi_stat_counter_increase(&stats->purge_calls, 1); - _mi_stat_increase(&stats->purged, size); + _mi_stat_counter_increase(&os_stats->purge_calls, 1); + _mi_stat_increase(&os_stats->purged, size); if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? !_mi_preloading()) // don't decommit during preloading (unsafe) { bool needs_recommit = true; - mi_os_decommit_ex(p, size, &needs_recommit, stats); + mi_os_decommit_ex(p, size, &needs_recommit); return needs_recommit; } else { if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed - _mi_os_reset(p, size, stats); + _mi_os_reset(p, size); } return false; // needs no recommit } @@ -493,8 +481,8 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. -bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) { - return _mi_os_purge_ex(p, size, true, stats); +bool _mi_os_purge(void* p, size_t size) { + return _mi_os_purge_ex(p, size, true); } @@ -601,15 +589,15 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // no success, issue a warning and break if (p != NULL) { _mi_warning_message("could not allocate contiguous huge OS page %zu at %p\n", page, addr); - mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, true, &_mi_stats_main); + mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, true); } break; } // success, record it page++; // increase before timeout check (see issue #711) - _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); - _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); + _mi_stat_increase(&os_stats->committed, MI_HUGE_OS_PAGE_SIZE); + _mi_stat_increase(&os_stats->reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout if (max_msecs > 0) { @@ -643,11 +631,11 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // free every huge page in a range individually (as we allocated per page) // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. 
-static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats) { +static void mi_os_free_huge_os_pages(void* p, size_t size) { if (p==NULL || size==0) return; uint8_t* base = (uint8_t*)p; while (size >= MI_HUGE_OS_PAGE_SIZE) { - mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, true, stats); + mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, true); size -= MI_HUGE_OS_PAGE_SIZE; base += MI_HUGE_OS_PAGE_SIZE; } @@ -676,8 +664,7 @@ size_t _mi_os_numa_node_count_get(void) { return count; } -int _mi_os_numa_node_get(mi_os_tld_t* tld) { - MI_UNUSED(tld); +int _mi_os_numa_node_get(void) { size_t numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 diff --git a/src/page-map.c b/src/page-map.c index 7a00d172..5c712346 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -29,7 +29,7 @@ bool _mi_page_map_init(void) { // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); mi_page_map_all_committed = (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? - _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); + _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); return false; @@ -41,7 +41,7 @@ bool _mi_page_map_init(void) { // commit the first part so NULL pointers get resolved without an access violation if (!mi_page_map_all_committed) { bool is_zero; - _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero, NULL); + _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(_mi_page_map, _mi_os_page_size()); } } _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL @@ -60,7 +60,7 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { bool is_zero; uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); const size_t size = mi_page_map_entries_per_commit_bit; - _mi_os_commit(start, size, &is_zero, NULL); + _mi_os_commit(start, size, &is_zero); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } mi_bitmap_set(&mi_page_map_commit, i); } diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 80522f47..e06b278d 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -17,6 +17,11 @@ terms of the MIT license. 
A copy of the license can be found in the file // Dynamically bind Windows API points for portability //--------------------------------------------- +#if defined(_MSC_VER) +#pragma warning(disable:28159) // don't use GetVersion +#pragma warning(disable:4996) // don't use GetVersion +#endif + static DWORD win_major_version = 6; static DWORD win_minor_version = 0; @@ -126,8 +131,8 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) SYSTEM_INFO si; GetSystemInfo(&si); if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; } - if (si.dwAllocationGranularity > 0) { - config->alloc_granularity = si.dwAllocationGranularity; + if (si.dwAllocationGranularity > 0) { + config->alloc_granularity = si.dwAllocationGranularity; win_allocation_granularity = si.dwAllocationGranularity; } // get virtual address bits @@ -141,7 +146,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) { config->physical_memory = (size_t)(memInKiB * MI_KiB); } - } + } // get the VirtualAlloc2 function HINSTANCE hDll; hDll = LoadLibrary(TEXT("kernelbase.dll")); @@ -818,14 +823,13 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { } #endif - bool _mi_prim_thread_is_in_threadpool(void) { #if (MI_ARCH_X64 || MI_ARCH_X86) if (win_major_version >= 6) { // check if this thread belongs to a windows threadpool // see: _TEB* const teb = NtCurrentTeb(); - void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778))); + void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778))); return (pool_data != NULL); } #endif diff --git a/src/static.c b/src/static.c index 0a8fa447..dd874f16 100644 --- a/src/static.c +++ b/src/static.c @@ -24,6 +24,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "alloc-aligned.c" #include "alloc-posix.c" #include "arena.c" +#include "arena-meta.c" #include "bitmap.c" #include "heap.c" #include "init.c" From 2a4af6f169087a60d769ffa61c192961960f5e11 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 17:21:17 -0800 Subject: [PATCH 050/264] comments --- src/init.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/init.c b/src/init.c index f09821b4..7e3e5f86 100644 --- a/src/init.c +++ b/src/init.c @@ -96,7 +96,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -#define MI_MEMID_STATIC {{{0}},true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } +#define MI_MEMID_STATIC {{{0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, @@ -250,6 +250,9 @@ static mi_tld_t* mi_tld_alloc(void) { return &tld_main; } else { + // allocate tld meta-data + // note: we need to be careful to not access the tld from `_mi_meta_zalloc` + // (and in turn from `_mi_arena_alloc_aligned` and `_mi_os_alloc_aligned`). mi_memid_t memid; mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid); if (tld==NULL) { @@ -414,7 +417,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // 1. windows dynamic library: // call from DllMain on DLL_THREAD_DETACH // 2. windows static library: -// use `FlsAlloc` to call a destructor when the thread is done +// use special linker section to call a destructor when the thread is done // 3. 
unix, pthreads: // use a pthread key to call a destructor when a pthread is done // From bf2f2a8bf42397dc1fba1a9e353628013740661a Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 18:48:56 -0800 Subject: [PATCH 051/264] fix bug where only the first chunkmap field would be considered --- src/bitmap.c | 2 +- src/options.c | 2 +- test/test-stress.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 44113429..45a82ba3 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1028,7 +1028,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \ - const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ + const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS ); \ const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\ const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ diff --git a/src/options.c b/src/options.c index f2e9297f..e47f1b6e 100644 --- a/src/options.c +++ b/src/options.c @@ -68,7 +68,7 @@ typedef struct mi_option_desc_s { // in KiB #ifndef MI_DEFAULT_ARENA_RESERVE #if (MI_INTPTR_SIZE>4) - #define MI_DEFAULT_ARENA_RESERVE 1024L*1024L + #define MI_DEFAULT_ARENA_RESERVE 8*1024L*1024L #else #define MI_DEFAULT_ARENA_RESERVE 128L*1024L #endif diff --git a/test/test-stress.c b/test/test-stress.c index 96cf702d..915c953f 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -57,7 +57,7 @@ static int ITER = 10; #define ALLOW_LARGE true #else static int THREADS = 32; // more repeatable if THREADS <= #processors -static int SCALE = 50; // scaling factor +static int SCALE = 25; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads #endif From 68ac94c1baccbeac37a8dd75dddb34542b08e8f0 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 18:53:43 -0800 Subject: [PATCH 052/264] set default arena reserve back to 1GiB --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index e47f1b6e..f2e9297f 100644 --- a/src/options.c +++ b/src/options.c @@ -68,7 +68,7 @@ typedef struct mi_option_desc_s { // in KiB #ifndef MI_DEFAULT_ARENA_RESERVE #if (MI_INTPTR_SIZE>4) - #define MI_DEFAULT_ARENA_RESERVE 8*1024L*1024L + #define MI_DEFAULT_ARENA_RESERVE 1024L*1024L #else #define MI_DEFAULT_ARENA_RESERVE 128L*1024L #endif From d5ed0cc71ef02b5ab986fa7ffc06b4c6e65dd622 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 14:31:43 -0800 Subject: [PATCH 053/264] various improvements --- include/mimalloc/atomic.h | 3 + include/mimalloc/bits.h | 15 ++- include/mimalloc/types.h | 6 +- src/arena.c | 52 ++++++--- src/bitmap.c | 238 +++++++++++++++++++++----------------- src/bitmap.h | 20 +++- src/free.c | 7 +- src/init.c | 2 +- src/os.c | 13 +-- src/random.c | 19 ++- 10 files changed, 223 insertions(+), 152 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index caa90cf8..3b0ff559 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -74,8 +74,11 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release)) #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel)) + +#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) +#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 3afac04d..e47d8a76 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -229,7 +229,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { unsigned long i; return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false); #else - return (x!=0 ? (*idx = mi_ctz(x), true) : false); + return (x!=0 ? (*idx = mi_ctz(x), true) : false); #endif } @@ -289,5 +289,18 @@ static inline size_t mi_rotl(size_t x, size_t r) { #endif } +static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) { + #if mi_has_builtin(rotateleft32) + return mi_builtin(rotateleft32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return _lrotl(x, (int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & 31; + return ((x << rshift) | (x >> ((-rshift) & 31))); + #endif +} + #endif // MI_BITS_H diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d507ca69..71edb397 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -334,9 +334,9 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~16KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // ~2MiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index ab74b988..24835f42 100644 --- a/src/arena.c +++ b/src/arena.c @@ -29,7 +29,8 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo ----------------------------------------------------------- */ #define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) - +#define MI_ARENA_MIN_SIZE (MI_BCHUNK_BITS * MI_ARENA_SLICE_SIZE) // 32 MiB (or 8 MiB on 32-bit) +#define MI_ARENA_MAX_SIZE (MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE) // A memory arena descriptor typedef struct mi_arena_s { @@ -105,7 +106,7 @@ size_t mi_arena_get_count(void) { mi_arena_t* mi_arena_from_index(size_t idx) { mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); + return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]); } mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { @@ -235,6 +236,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } } } + if (memid->initially_zero) { + mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE); + } + else { + mi_track_mem_undefined(p, slice_count * MI_ARENA_SLICE_SIZE); + } } else { // no need to commit, but check if already fully committed @@ -253,7 +260,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // try to reserve a fresh arena space static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { - if (_mi_preloading()) return false; // use OS only while pre loading + // if (_mi_preloading()) return false; // use OS only while pre loading if (req_arena_id != _mi_arena_id_none()) return false; const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); @@ -269,8 +276,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 1 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + // scale up the arena sizes exponentially every 4 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -278,8 +285,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = 8 * MI_ARENA_SLICE_SIZE; // hope that fits minimal bitmaps? 
- const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB + const size_t min_reserve = MI_ARENA_MIN_SIZE; + const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; } @@ -294,7 +301,17 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); + // and try to reserve the arena + int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + if (err != 0) { + // failed, try a smaller size? + const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); + if (arena_reserve > small_arena_reserve) { + // try again + err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + } + } + return (err==0); } @@ -317,12 +334,12 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are #define mi_forall_arenas(req_arena_id, tseq, name_arena) \ { \ - const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \ + const size_t _arena_count = mi_arena_get_count(); \ if (_arena_count > 0) { \ const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \ size_t _start; \ if (req_arena_id == _mi_arena_id_none()) { \ - /* always start searching in an arena 1 below the max */ \ + /* always start searching in the arena's below the max */ \ _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \ } \ else { \ @@ -333,10 +350,10 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are size_t _idx; \ if (_i < _arena_cycle) { \ _idx = _i + _start; \ - if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \ + if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate through the cycle */ \ } \ else { \ - _idx = _i; \ + _idx = _i; /* remaining arena's */ \ } \ mi_arena_t* const name_arena = mi_arena_from_index(_idx); \ if (name_arena != NULL) \ @@ -397,6 +414,9 @@ again: // did we need a specific arena? if (req_arena_id != _mi_arena_id_none()) return NULL; + // don't create arena's while preloading (todo: or should we?) + if (_mi_preloading()) return NULL; + // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; @@ -917,7 +937,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); size_t new_max_arena = 0; for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); @@ -949,7 +969,7 @@ void _mi_arena_unsafe_destroy_all(void) { // Is a pointer inside any of our arenas? 
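
/* ----------------------------------------------------------------------------
   Illustrative sketch (not part of the patch): the visiting order implemented
   by the `mi_forall_arenas` macro above. Each thread starts at `tseq % cycle`
   within the arenas below the last one and rotates through that cycle, then
   visits the remaining arenas in order. `visit_order` is a hypothetical name.
---------------------------------------------------------------------------- */
#include <stdio.h>

static void visit_order(size_t arena_count, size_t tseq) {
  if (arena_count == 0) return;
  const size_t cycle = arena_count - 1;            // first search the arenas below the last one
  const size_t start = (cycle <= 1 ? 0 : tseq % cycle);
  for (size_t i = 0; i < arena_count; i++) {
    size_t idx;
    if (i < cycle) {
      idx = i + start;
      if (idx >= cycle) { idx -= cycle; }          // rotate through the cycle
    }
    else {
      idx = i;                                     // the remaining arenas (including the last)
    }
    printf("tseq=%zu visits arena %zu\n", tseq, idx);
  }
}

int main(void) {
  visit_order(4, 0);   // 0, 1, 2, 3
  visit_order(4, 1);   // 1, 2, 0, 3 -- different threads start in different arenas
  return 0;
}
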
bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { @@ -1175,7 +1195,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { MI_UNUSED(show_abandoned); - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; @@ -1331,7 +1351,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); if (max_arena == 0) return; // _mi_error_message(EFAULT, "purging not yet implemented\n"); diff --git a/src/bitmap.c b/src/bitmap.c index 45a82ba3..2f563066 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -14,6 +14,8 @@ Concurrent bitmap that can set/reset sequences of bits atomically #include "mimalloc/bits.h" #include "bitmap.h" +#define MI_USE_SIMD 0 + /* -------------------------------------------------------------------------------- bfields -------------------------------------------------------------------------------- */ @@ -34,9 +36,9 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } -static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { - return mi_rotr(x,r); -} +//static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { +// return mi_rotr(x,r); +//} static inline mi_bfield_t mi_bfield_zero(void) { return 0; @@ -456,7 +458,7 @@ static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t // ------- mi_bchunk_try_find_and_clear --------------------------------------- -#if defined(__AVX2__) +#if MI_USE_SIMD && defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { return _mm256_setzero_si256(); } @@ -471,12 +473,27 @@ static inline bool mi_mm256_is_zero( __m256i vec) { } #endif +static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) { + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); + size_t cidx; + if (!allow_allset && (~b == 0)) return false; + if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + return true; + } + } + return false; +} + // Find least 1-bit in a chunk and try to clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // This is used to find free slices and abandoned pages and should be efficient. 
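
/* ----------------------------------------------------------------------------
   Minimal standalone sketch (hypothetical helper, not the patch's code) of the
   find-and-clear pattern used by `mi_bchunk_try_find_and_clear_at` above:
   load the field, locate the least significant 1-bit, and try to clear just
   that bit atomically; if another thread raced us and cleared it first, report
   failure so the caller can retry. Assumes 64-bit fields, C11 atomics, and the
   GCC/Clang `__builtin_ctzll` builtin.
---------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool try_find_and_clear(_Atomic(uint64_t)* field, size_t* bit_idx) {
  const uint64_t b = atomic_load_explicit(field, memory_order_relaxed);
  if (b == 0) return false;                        // no set bit in this field
  const size_t idx = (size_t)__builtin_ctzll(b);   // least significant 1-bit
  const uint64_t mask = (uint64_t)1 << idx;
  // clear only that bit; the returned old value tells us whether we won the race
  const uint64_t old = atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel);
  if ((old & mask) == 0) return false;             // another thread claimed it already
  *bit_idx = idx;
  return true;
}
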
// todo: try neon version static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) @@ -485,19 +502,10 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again } - #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; #if 0 @@ -528,42 +536,50 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. chunk_idx = _tzcnt_u64(mask) / 8; #endif - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the bit-idx that is clear - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again } #else + // try first to find a field that is not all set (to reduce fragmentation) for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); - size_t idx; - if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically - *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true; + } + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true; } return false; #endif } +static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); + if (!allow_all_set && (~b == 0)) return false; + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = + ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + const size_t 
byte_idx = idx/8; + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // unset the byte atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); + return true; + } + } + return false; +} // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find medium size pages in the free blocks. // todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -588,24 +604,12 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // try again } #else + // first skip allset fields to reduce fragmentation for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]); - // has_set8 has low bit in each byte set if the byte in x == 0xFF - const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F - (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 - >> 7; // shift high bit to low bit - size_t idx; - if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit - mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); - mi_assert_internal((idx%8)==0); - const size_t byte_idx = idx/8; - if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[i],byte_idx,NULL)) { // unset the byte atomically - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); - return true; - } - // else continue - } + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; + } + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true; } return false; #endif @@ -618,7 +622,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // Used to find large size pages in the free blocks. 
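
/* ----------------------------------------------------------------------------
   Illustrative sketch (not from the patch) of the `has_set8` bit trick above.
   Detecting a 0xFF byte in `b` is the classic "has zero byte" SWAR trick
   applied to `~b`:  ((~b - 0x01..01) & b & 0x80..80).  Borrows in the
   subtraction only propagate towards higher bytes, so the *lowest* flagged
   byte is always genuinely 0xFF -- which is why the code scans from the least
   significant bit. Helper names are hypothetical; assumes GCC/Clang builtins.
---------------------------------------------------------------------------- */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define LO_BIT8  UINT64_C(0x0101010101010101)
#define HI_BIT8  UINT64_C(0x8080808080808080)

static int lowest_full_byte_swar(uint64_t b) {      // byte index of lowest 0xFF byte, or -1
  const uint64_t has_set8 = ((~b - LO_BIT8) & (b & HI_BIT8)) >> 7;
  return (has_set8 == 0 ? -1 : __builtin_ctzll(has_set8) / 8);
}

static int lowest_full_byte_naive(uint64_t b) {
  for (int i = 0; i < 8; i++) {
    if (((b >> (8*i)) & 0xFF) == 0xFF) return i;
  }
  return -1;
}

int main(void) {
  const uint64_t samples[] = {
    0, ~UINT64_C(0), UINT64_C(0x00FF000000000000),
    UINT64_C(0x12FF34FF56789ABC), UINT64_C(0x80FF7F01FF000001)
  };
  for (size_t i = 0; i < sizeof(samples)/sizeof(samples[0]); i++) {
    assert(lowest_full_byte_swar(samples[i]) == lowest_full_byte_naive(samples[i]));
  }
  return 0;
}
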
// todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -747,14 +751,14 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, } -static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages - if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages - if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); - return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); -} +//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +// if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages +// if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages +// if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages +// if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk +// if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); +// return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); +//} // ------- mi_bchunk_clear_once_set --------------------------------------- @@ -779,10 +783,10 @@ static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { // are all bits in a bitmap chunk clear? 
static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return mi_mm256_is_zero(vec); - #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); @@ -796,9 +800,17 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { bitmap chunkmap -------------------------------------------------------------------------------- */ +static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) { + size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); + if mi_unlikely(chunk_idx > oldmax) { + mi_atomic_cas_strong_relaxed(&bitmap->chunk_max_accessed, &oldmax, chunk_idx); + } +} + static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); } static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { @@ -813,11 +825,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } - // record the max clear - size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); - do { - if mi_likely(chunk_idx <= oldmax) break; - } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); + mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); return true; } @@ -894,6 +902,9 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); } + + // reset max_accessed + mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0); } @@ -1027,31 +1038,27 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \ - const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS ); \ - const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\ - const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ - const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ + const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \ + const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? 
*/ \ + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \ + const size_t chunkmap_max_acc = _mi_divide_up(chunk_max_acc,MI_BFIELD_BITS); \ + const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ - for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ + for (size_t _i = 0; _i < chunkmap_max; _i++) { \ size_t i; \ - if (_i < chunkmap_hi_bfield) { \ - i = _i + chunkmap_start; /* first the chunks up to chunk_hi */ \ - if (i >= chunkmap_hi_bfield) { i -= chunkmap_hi_bfield; } /* rotate */ \ + if (_i < chunkmap_max_acc) { /* first the chunks up to chunk_max_accessed */ \ + i = _i + chunkmap_start; \ + if (i >= chunkmap_max_acc) { i -= chunkmap_max_acc; } /* rotate */ \ } \ - else { i = _i; } /* the rest of the chunks above chunk_hi_idx */ \ + else { i = _i; } /* the rest of the chunks above chunk_max_accessed */ \ const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ - size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0 && chunkmap_start_idx > 0) { \ - cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \ - cmap_idx_shift = chunkmap_start_idx; \ - } \ + /* todo: space out threads within a chunkmap (2GiB) as well? */ \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ size_t cmap_idx; \ while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ /* set the chunk idx */ \ size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ - mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \ /* try to find and clear N bits in that chunk */ \ { @@ -1064,28 +1071,45 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } \ }} -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS) -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) -{ - // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); - mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) - { - size_t cidx; - if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); - return true; - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - } - } - mi_bitmap_forall_chunks_end(); - return false; + +#define mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NSUF, NPAR) { \ + mi_bitmap_forall_chunks(bitmap, tseq, _chunk_idx) { \ + size_t _cidx; \ + if mi_likely(mi_bchunk_try_find_and_clear##NSUF(&bitmap->chunks[_chunk_idx] NPAR, &_cidx)) { \ + *pidx = (_chunk_idx * MI_BCHUNK_BITS) + _cidx; \ + return true; \ + } \ + else { \ + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ \ + mi_bitmap_chunkmap_try_clear(bitmap, _chunk_idx); \ + } \ + } \ + mi_bitmap_forall_chunks_end(); \ + return false; \ +} + +#define COMMA , + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, , ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, 8, ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, X, ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NX, COMMA n); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, N_, COMMA n); } diff --git a/src/bitmap.h b/src/bitmap.h index 40c4df42..b26791cc 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -91,8 +91,8 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT @@ -172,9 +172,23 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n } +// Specialized versions for common bit sequence sizes +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS + // Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
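
/* ----------------------------------------------------------------------------
   Small standalone illustration (hypothetical names, not from the patch) of
   the token-pasting dispatch used by `mi_bitmap_forall_chunks_try_find_and_clear`
   above: the `NSUF` argument is pasted onto the callee name to pick the
   specialization, and because a bare `,` would split macro arguments, the
   optional extra parameter is smuggled in via `#define COMMA ,`.
---------------------------------------------------------------------------- */
#include <stdio.h>

static int find1 (int x)        { return x + 1; }
static int findN_(int x, int n) { return x + n; }

#define COMMA ,
#define DISPATCH(x, NSUF, NPAR)  find##NSUF(x NPAR)

int main(void) {
  printf("%d\n", DISPATCH(10, 1, ));          // expands to find1(10)
  printf("%d\n", DISPATCH(10, N_, COMMA 5));  // expands to findN_(10, 5)
  return 0;
}
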
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); +mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { + if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages + if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); + return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); +} // Called once a bit is cleared to see if the memory slice can be claimed. diff --git a/src/free.c b/src/free.c index d45507e7..0da0332e 100644 --- a/src/free.c +++ b/src/free.c @@ -217,8 +217,11 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 2. if the page is not too full, we can try to reclaim it for ourselves + // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && - !mi_page_is_used_at_frac(page,8)) + !mi_page_is_used_at_frac(page,4) + // && !mi_page_is_abandoned_mapped(page) + ) { // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again @@ -247,7 +250,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!mi_page_is_used_at_frac(page, 4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!mi_page_is_used_at_frac(page,4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arena_page_try_reabandon_to_mapped(page)) { diff --git a/src/init.c b/src/init.c index 2396f594..2070405d 100644 --- a/src/init.c +++ b/src/init.c @@ -96,7 +96,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -#define MI_MEMID_STATIC {{{0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } +#define MI_MEMID_STATIC {{{NULL,0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, diff --git a/src/os.c b/src/os.c index b913fb1c..55f7428e 100644 --- a/src/os.c +++ b/src/os.c @@ -203,10 +203,9 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); - // try a direct allocation if the alignment is below the default, or if larger than 1/64 fraction of the size (to avoid waste). - const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/64); + // try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size. 
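
/* ----------------------------------------------------------------------------
   Rough standalone sketch (hypothetical helper, not the patch's code) of the
   fallback used in this hunk when the OS cannot honor the alignment directly:
   over-allocate by `alignment`, hand out the aligned pointer inside the larger
   region, and remember the original base, since that base (not the aligned
   pointer) is what must be given back to the OS to free. `malloc` stands in
   for the OS primitive; `alignment` must be a power of two.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdlib.h>

typedef struct aligned_result_s {
  void* base;     // actual allocation start (needed for free)
  void* aligned;  // base rounded up to `alignment`
} aligned_result_t;

static aligned_result_t alloc_aligned_fallback(size_t size, size_t alignment) {
  aligned_result_t r = { NULL, NULL };
  void* base = malloc(size + alignment);   // over-allocate so an aligned block of `size` always fits
  if (base == NULL) return r;
  r.base    = base;
  r.aligned = (void*)(((uintptr_t)base + alignment - 1) & ~(uintptr_t)(alignment - 1));
  return r;
}

int main(void) {
  aligned_result_t r = alloc_aligned_fallback(1000, 256);
  if (r.base != NULL) { free(r.base); }    // free the base, never the aligned pointer
  return 0;
}
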
+ const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8); - // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) void* p = NULL; if (try_direct_alloc) { p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero); @@ -233,8 +232,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (p == NULL) return NULL; // set p to the aligned part in the full region - // note: this is dangerous on Windows as VirtualFree needs the actual base pointer - // this is handled though by having the `base` field in the memid's + // note: on Windows VirtualFree needs the actual base pointer + // this is handledby having the `base` field in the memid. *base = p; // remember the base p = _mi_align_up_ptr(p, alignment); @@ -361,7 +360,7 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, if (newsize != NULL) *newsize = 0; if (size == 0 || addr == NULL) return NULL; - // page align conservatively within the range + // page align conservatively within the range, or liberally straddling pages outside the range void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size()) : mi_align_down_ptr(addr, _mi_os_page_size())); void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size()) @@ -472,7 +471,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) return needs_recommit; } else { - if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed + if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory) _mi_os_reset(p, size); } return false; // needs no recommit diff --git a/src/random.c b/src/random.c index 4fc8b2f8..35e2718a 100644 --- a/src/random.c +++ b/src/random.c @@ -7,7 +7,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" // _mi_prim_random_buf -#include // memset /* ---------------------------------------------------------------------------- We use our own PRNG to keep predictable performance of random number generation @@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil (gcc x64 has no register spills, and clang 6+ uses SSE instructions) -----------------------------------------------------------------------------*/ -static inline uint32_t rotl(uint32_t x, uint32_t shift) { - return (x << shift) | (x >> (32 - shift)); -} - static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) { - x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16); - x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12); - x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8); - x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7); + x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16); + x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12); + x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8); + x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7); } static void chacha_block(mi_random_ctx_t* ctx) @@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no // since we only use chacha for randomness (and not encryption) we // do not _need_ to read 32-bit values as little endian but we do anyways // just for being compatible :-) - memset(ctx, 0, sizeof(*ctx)); + _mi_memzero(ctx, sizeof(*ctx)); for (size_t i = 0; i < 4; i++) { const uint8_t* sigma = (uint8_t*)"expand 32-byte k"; ctx->input[i] = read32(sigma,i); @@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no } static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) { - memset(ctx_new, 0, sizeof(*ctx_new)); + _mi_memzero(ctx_new, sizeof(*ctx_new)); _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input)); ctx_new->input[12] = 0; ctx_new->input[13] = 0; @@ -160,7 +155,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random - x ^= _mi_prim_clock_now(); + x ^= _mi_prim_clock_now(); // and do a few randomization steps uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1; for (uintptr_t i = 0; i < max; i++) { From 351cb0c7407ef95ba152d7a6c5d22b407a76b784 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 15:16:36 -0800 Subject: [PATCH 054/264] small fixes for macOS --- CMakeLists.txt | 20 +++++++++----------- include/mimalloc/internal.h | 2 +- src/page-map.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6df4ba5a..553b279d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -330,20 +330,18 @@ endif() # Determine architecture set(MI_OPT_ARCH_FLAGS "") -set(MI_ARCH "unknown") -if(APPLE) - list(FIND CMAKE_OSX_ARCHITECTURES "x86_64" x64_index) - list(FIND CMAKE_OSX_ARCHITECTURES "arm64" arm64_index) - if(x64_index GREATER_EQUAL 0) - set(MI_ARCH "x64") - elseif(arm64_index GREATER_EQUAL 0) - set(MI_ARCH "arm64") - endif() -elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64") +set(MI_ARCH "") +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR + CMAKE_GENERATOR_PLATFORM STREQUAL "x64") # msvc set(MI_ARCH "x64") -elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" 
OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") +elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR + CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR # apple + CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") # msvc set(MI_ARCH "arm64") endif() +if(MI_ARCH) + message(STATUS "Architecture: ${MI_ARCH}") +endif() # Check /proc/cpuinfo for an SV39 MMU and limit the virtual address bits. # (this will skip the aligned hinting in that case. Issue #939, #949) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 4c8256a0..176c1de8 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -459,7 +459,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { } static inline mi_page_t* _mi_ptr_page(const void* p) { - #if MI_DEBUG + #if MI_DEBUG || defined(__APPLE__) return _mi_checked_ptr_page(p); #else return _mi_ptr_page_ex(p,NULL); diff --git a/src/page-map.c b/src/page-map.c index 5c712346..475e8fc2 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -12,6 +12,7 @@ terms of the MIT license. A copy of the license can be found in the file mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; +static void* mi_page_map_max_address = NULL; static mi_memid_t mi_page_map_memid; // (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) @@ -23,12 +24,13 @@ bool _mi_page_map_init(void) { if (vbits >= 48) vbits = 47; // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) // 64 KiB for 4 GiB address space (on 32-bit) + mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_DEFAULT_BIT_COUNT); // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); - mi_page_map_all_committed = (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + mi_page_map_all_committed = true; // (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? 
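
/* ----------------------------------------------------------------------------
   Worked sizing example (a sketch, not part of the patch): with one byte of
   page map per slice and a slice shift of 16 (64 KiB slices -- an assumption
   that matches the "2 GiB for 128 TiB address space" comment above), the page
   map size is simply 1 << (vbits - slice_shift).
---------------------------------------------------------------------------- */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const size_t slice_shift = 16;                  // 64 KiB per slice (assumed)
  const size_t vbits[]     = { 47, 39, 32 };      // 128 TiB, 512 GiB (e.g. SV39), 4 GiB
  for (size_t i = 0; i < 3; i++) {
    const uint64_t map_size = (uint64_t)1 << (vbits[i] - slice_shift);
    printf("vbits=%2zu -> page map of %llu KiB\n", vbits[i], (unsigned long long)(map_size/1024));
  }
  return 0;
}
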
_mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); @@ -118,8 +120,12 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + // if mi_unlikely(_mi_page_map==NULL) { // happens on macOS during loading + // _mi_page_map_init(); + // } + if mi_unlikely(p >= mi_page_map_max_address) return false; uintptr_t idx = ((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_setN(&mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { + if (mi_page_map_all_committed || mi_bitmap_is_setN(&mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { From 8f5449d2715c66f67a2d3fb2c3f0800ce59ced9a Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 15:39:15 -0800 Subject: [PATCH 055/264] various fixes for test pipeline --- src/alloc-aligned.c | 4 ++-- src/alloc.c | 7 +++---- src/free.c | 2 +- src/page.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index b1e6329c..4b142a1e 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -29,7 +29,7 @@ static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, si mi_assert_internal(alignment > 0 && alignment < MI_BLOCK_ALIGNMENT_MAX); const size_t oversize = size + alignment - 1; void* base = _mi_heap_malloc_guarded(heap, oversize, zero); - void* p = mi_align_up_ptr(base, alignment); + void* p = _mi_align_up_ptr(base, alignment); mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size); mi_assert_internal(mi_usable_size(p) >= size); mi_assert_internal(_mi_is_aligned(p, alignment)); @@ -175,7 +175,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t } #if MI_GUARDED - if (offset==0 && alignment < MI_BLOCK_ALIGNMENT_MAX && mi_heap_malloc_use_guarded(heap,size)) { + if (offset==0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN && mi_heap_malloc_use_guarded(heap,size)) { return mi_heap_malloc_guarded_aligned(heap, size, alignment, zero); } #endif diff --git a/src/alloc.c b/src/alloc.c index 840d34fe..b0c89e65 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -619,7 +619,6 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { block->next = MI_BLOCK_TAG_GUARDED; // set guard page at the end of the block - mi_segment_t* const segment = _mi_page_segment(page); const size_t block_size = mi_page_block_size(page); // must use `block_size` to match `mi_free_local` const size_t os_page_size = _mi_os_page_size(); mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t)); @@ -630,7 +629,7 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { } uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size; mi_assert_internal(_mi_is_aligned(guard_page, os_page_size)); - if (segment->allow_decommit && _mi_is_aligned(guard_page, os_page_size)) { + if (!page->memid.is_pinned && _mi_is_aligned(guard_page, os_page_size)) { _mi_os_protect(guard_page, os_page_size); } else { @@ -640,9 +639,9 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { // align pointer just in front of the guard page size_t offset = block_size - os_page_size - obj_size; mi_assert_internal(offset > 
sizeof(mi_block_t)); - if (offset > MI_BLOCK_ALIGNMENT_MAX) { + if (offset > MI_PAGE_MAX_OVERALLOC_ALIGN) { // give up to place it right in front of the guard page if the offset is too large for unalignment - offset = MI_BLOCK_ALIGNMENT_MAX; + offset = MI_PAGE_MAX_OVERALLOC_ALIGN; } void* p = (uint8_t*)block + offset; mi_track_align(block, p, offset, obj_size); diff --git a/src/free.c b/src/free.c index 0da0332e..49bf8bf6 100644 --- a/src/free.c +++ b/src/free.c @@ -519,7 +519,7 @@ static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) { const size_t bsize = mi_page_block_size(page); const size_t psize = _mi_os_page_size(); mi_assert_internal(bsize > psize); - mi_assert_internal(_mi_page_segment(page)->allow_decommit); + mi_assert_internal(!page->memid.is_pinned); void* gpage = (uint8_t*)block + bsize - psize; mi_assert_internal(_mi_is_aligned(gpage, psize)); _mi_os_unprotect(gpage, psize); diff --git a/src/page.c b/src/page.c index f21bf91f..98319e53 100644 --- a/src/page.c +++ b/src/page.c @@ -756,7 +756,7 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { if (page != NULL) { #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { - mi_page_extend_free(heap, page, heap->tld); + mi_page_extend_free(heap, page); mi_assert_internal(mi_page_immediate_available(page)); } else From 3f732a981f8b4a8a7122b2b59f5c1a1b1141c848 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 15:49:20 -0800 Subject: [PATCH 056/264] fix debug build of MI_GUARDED --- src/alloc-aligned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 4b142a1e..38e0371d 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -26,7 +26,7 @@ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { #if MI_GUARDED static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, size_t size, size_t alignment, bool zero) mi_attr_noexcept { // use over allocation for guarded blocksl - mi_assert_internal(alignment > 0 && alignment < MI_BLOCK_ALIGNMENT_MAX); + mi_assert_internal(alignment > 0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t oversize = size + alignment - 1; void* base = _mi_heap_malloc_guarded(heap, oversize, zero); void* p = _mi_align_up_ptr(base, alignment); From bbcbd3cd1fee630547542c20f60b51d5eb62a001 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 19:06:06 -0800 Subject: [PATCH 057/264] add cast to avoid errors on clang 7 --- include/mimalloc/internal.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 176c1de8..c6d9ae36 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -534,7 +534,7 @@ static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { // Thread id of thread that owns this page static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { - return mi_atomic_load_relaxed(&page->xthread_id); + return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id); } // Thread free access @@ -605,11 +605,11 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_atomic_load_relaxed(&page->xthread_id) <= 
1); + return (mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id) <= 1); } static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return (mi_atomic_load_relaxed(&page->xthread_id) == 1); + return (mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id) == 1); } static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { @@ -675,7 +675,7 @@ static inline bool _mi_page_unown(mi_page_t* page) { // Page flags //----------------------------------------------------------- static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { - return mi_atomic_load_relaxed(&page->xflags); + return mi_atomic_load_relaxed(&((mi_page_t*)page)->xflags); } static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { From f28d5c7029976ce97565fe07ea5382c180c5f361 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 19:12:03 -0800 Subject: [PATCH 058/264] add cast to avoid errors on clang 7 --- src/bitmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.h b/src/bitmap.h index b26791cc..191b6864 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -100,7 +100,7 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) { - return mi_atomic_load_relaxed(&bitmap->chunk_count); + return mi_atomic_load_relaxed(&((mi_bitmap_t*)bitmap)->chunk_count); } static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { From 56a1bd7f9ec5b37f65de8c8500ee5c4a4497d553 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 19:43:00 -0800 Subject: [PATCH 059/264] fix 32 bit multiply in generic ctz/clz --- include/mimalloc/bits.h | 4 ++-- src/libc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index e47d8a76..cb0191cf 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -220,7 +220,7 @@ static inline size_t mi_popcount(size_t x) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsf(size_t x, size_t* idx) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); @@ -237,7 +237,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). 
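
/* ----------------------------------------------------------------------------
   Small standalone check (not from the patch) of the relation between the
   index of the highest set bit -- what a bit-scan-reverse such as `mi_bsr`
   reports -- and count-leading-zeros: for an N-bit word whose highest set bit
   is at index k, clz(x) == N-1-k. Uses the GCC/Clang builtin as the reference.
---------------------------------------------------------------------------- */
#include <assert.h>
#include <stdint.h>

int main(void) {
  for (int k = 0; k < 64; k++) {
    const uint64_t x = (uint64_t)1 << k;   // highest (and only) set bit at index k
    assert(__builtin_clzll(x) == 63 - k);
  }
  return 0;
}
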
static inline bool mi_bsr(size_t x, size_t* idx) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); diff --git a/src/libc.c b/src/libc.c index 3fdbf3e7..2b28bd25 100644 --- a/src/libc.c +++ b/src/libc.c @@ -289,7 +289,7 @@ static size_t mi_ctz_generic32(uint32_t x) { 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; if (x==0) return 32; - return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; + return debruijn[((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27]; } static size_t mi_clz_generic32(uint32_t x) { @@ -304,7 +304,7 @@ static size_t mi_clz_generic32(uint32_t x) { x |= x >> 4; x |= x >> 8; x |= x >> 16; - return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; + return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27]; } size_t _mi_clz_generic(size_t x) { From e44815ed6fa19eaf12d9141c1e202d8308dcf113 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 20:06:48 -0800 Subject: [PATCH 060/264] add bsf/bsr for compilation with older compilers (clang 7) --- include/mimalloc/bits.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index cb0191cf..4f0dce71 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -153,9 +153,9 @@ size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); static inline size_t mi_ctz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 - uint64_t r; - __asm ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 + size_t r; + __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_tzcnt_u64(x); @@ -164,6 +164,11 @@ static inline size_t mi_ctz(size_t x) { return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); #elif mi_has_builtinz(ctz) return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); + #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) + if (x==0) return MI_SIZE_BITS; + size_t r; + __asm ("bsf\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return r; #else #define MI_HAS_FAST_BITSCAN 0 return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS); @@ -172,9 +177,9 @@ static inline size_t mi_ctz(size_t x) { static inline size_t mi_clz(size_t x) { #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 - uint64_t r; - __asm ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); - return r; + size_t r; + __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -182,6 +187,11 @@ static inline size_t mi_clz(size_t x) { return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS); #elif mi_has_builtinz(clz) return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS); + #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) + if (x==0) return MI_SIZE_BITS; + size_t r; + __asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return (MI_SIZE_BITS - 1 - r); #else #define MI_HAS_FAST_BITSCAN 0 return (x!=0 ? 
_mi_clz_generic(x) : MI_SIZE_BITS); From 3a92c3527045b7922e12131d248b6d57ea646de9 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 20:25:22 -0800 Subject: [PATCH 061/264] improve generic ctz/clz --- src/libc.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/src/libc.c b/src/libc.c index 2b28bd25..15d4d2a7 100644 --- a/src/libc.c +++ b/src/libc.c @@ -289,7 +289,7 @@ static size_t mi_ctz_generic32(uint32_t x) { 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; if (x==0) return 32; - return debruijn[((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27]; + return debruijn[(uint32_t)((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27]; } static size_t mi_clz_generic32(uint32_t x) { @@ -307,25 +307,33 @@ static size_t mi_clz_generic32(uint32_t x) { return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27]; } -size_t _mi_clz_generic(size_t x) { - if (x==0) return MI_SIZE_BITS; - #if (MI_SIZE_BITS <= 32) - return mi_clz_generic32((uint32_t)x); - #else - const size_t count = mi_clz_generic32((uint32_t)(x >> 32)); - if (count < 32) return count; - return (32 + mi_clz_generic32((uint32_t)x)); - #endif -} - size_t _mi_ctz_generic(size_t x) { if (x==0) return MI_SIZE_BITS; #if (MI_SIZE_BITS <= 32) return mi_ctz_generic32((uint32_t)x); #else - const size_t count = mi_ctz_generic32((uint32_t)x); - if (count < 32) return count; - return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + const uint32_t lo = (uint32_t)x; + if (lo != 0) { + return mi_ctz_generic32(lo); + } + else { + return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + } + #endif +} + +size_t _mi_clz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_clz_generic32((uint32_t)x); + #else + const uint32_t hi = (uint32_t)(x>>32); + if (hi != 0) { + return mi_clz_generic32(hi); + } + else { + return 32 + mi_clz_generic32((uint32_t)x); + } #endif } From c5a2d11193da2335741a6c66fed8d88c6dd53764 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 20:40:26 -0800 Subject: [PATCH 062/264] add extra checks for valid pointers in the pagemap, add max_vabits and debug_commit_full_pagemap options --- ide/vs2022/mimalloc-override.vcxproj.filters | 4 ++- ide/vs2022/mimalloc.vcxproj.filters | 4 ++- include/mimalloc.h | 4 ++- include/mimalloc/internal.h | 1 + src/options.c | 2 ++ src/page-map.c | 37 +++++++++++--------- 6 files changed, 33 insertions(+), 19 deletions(-) diff --git a/ide/vs2022/mimalloc-override.vcxproj.filters b/ide/vs2022/mimalloc-override.vcxproj.filters index 0e63822c..fb48e98f 100644 --- a/ide/vs2022/mimalloc-override.vcxproj.filters +++ b/ide/vs2022/mimalloc-override.vcxproj.filters @@ -58,7 +58,9 @@ Sources - + + Sources + diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 7fc4ba9c..06b0364f 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,7 +58,9 @@ Sources - + + Sources + diff --git a/include/mimalloc.h b/include/mimalloc.h index 907ffadb..c11353b7 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -380,7 +380,9 @@ typedef enum mi_option_e { mi_option_target_segments_per_thread, // experimental (=0) mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_full_page_retain, // retain N full pages per size class (=2) - mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) + mi_option_max_page_candidates, // max candidate pages to consider for 
allocation (=4) + mi_option_max_vabits, // max virtual address bits to consider in user space (=48) + mi_option_debug_commit_full_pagemap, // commit the full pagemap to catch invalid pointer uses (=0) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c6d9ae36..c189a082 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -459,6 +459,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { } static inline mi_page_t* _mi_ptr_page(const void* p) { + mi_assert_internal(p==NULL || mi_is_in_heap_region(p)); #if MI_DEBUG || defined(__APPLE__) return _mi_checked_ptr_page(p); #else diff --git a/src/options.c b/src/options.c index f2e9297f..8fcee452 100644 --- a/src/options.c +++ b/src/options.c @@ -160,6 +160,8 @@ static mi_option_desc_t options[_mi_option_last] = { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(full_page_retain) }, { 4, UNINIT, MI_OPTION(max_page_candidates) }, + { 0, UNINIT, MI_OPTION(max_vabits) }, + { 0, UNINIT, MI_OPTION(debug_commit_full_pagemap) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page-map.c b/src/page-map.c index 475e8fc2..181db7f0 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -20,8 +20,11 @@ static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_C { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; bool _mi_page_map_init(void) { - size_t vbits = _mi_os_virtual_address_bits(); - if (vbits >= 48) vbits = 47; + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + if (vbits == 0) { + vbits = _mi_os_virtual_address_bits(); + if (vbits >= 48) { vbits = 47; } + } // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) // 64 KiB for 4 GiB address space (on 32-bit) mi_page_map_max_address = (void*)(MI_PU(1) << vbits); @@ -30,7 +33,7 @@ bool _mi_page_map_init(void) { mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_DEFAULT_BIT_COUNT); // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); - mi_page_map_all_committed = true; // (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + mi_page_map_all_committed = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); @@ -52,26 +55,28 @@ bool _mi_page_map_init(void) { } static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { - // is the page map area that contains the page address committed? 
- if (!mi_page_map_all_committed) { - const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; - const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; - for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_clearN(&mi_page_map_commit, i, 1)) { - // this may race, in which case we do multiple commits (which is ok) + // is the page map area that contains the page address committed? + // we always set the commit bits so we can track what ranges are in-use. + // we only actually commit if the map wasn't committed fully already. + const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; + const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; + for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_clearN(&mi_page_map_commit, i, 1)) { + // this may race, in which case we do multiple commits (which is ok) + if (!mi_page_map_all_committed) { bool is_zero; uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); const size_t size = mi_page_map_entries_per_commit_bit; _mi_os_commit(start, size, &is_zero); - if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } - mi_bitmap_set(&mi_page_map_commit, i); + if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); } } + mi_bitmap_set(&mi_page_map_commit, i); } - #if MI_DEBUG > 0 - _mi_page_map[idx] = 0; - _mi_page_map[idx+slice_count-1] = 0; - #endif } + #if MI_DEBUG > 0 + _mi_page_map[idx] = 0; + _mi_page_map[idx+slice_count-1] = 0; + #endif } static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) { From 6798375f4734cf9d579f00c8f55313a48616633d Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 21:26:23 -0800 Subject: [PATCH 063/264] temporarily add macOS 13 and 12 for testing --- azure-pipelines.yml | 48 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index eb520aa0..d853db2f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -305,3 +305,51 @@ jobs: - script: ctest --verbose --timeout 180 workingDirectory: $(BuildType) displayName: CTest + +- job: + displayName: macOS 13 (Ventura) + pool: + vmImage: + macOS-13 + strategy: + matrix: + Debug: + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + Release: + BuildType: release + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + steps: + - task: CMake@1 + inputs: + workingDirectory: $(BuildType) + cmakeArgs: .. $(cmakeExtraArgs) + - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) + displayName: Make + - script: ctest --verbose --timeout 180 + workingDirectory: $(BuildType) + displayName: CTest + +- job: + displayName: macOS 12 (Monterey) + pool: + vmImage: + macOS-12 + strategy: + matrix: + Debug: + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + Release: + BuildType: release + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + steps: + - task: CMake@1 + inputs: + workingDirectory: $(BuildType) + cmakeArgs: .. 
$(cmakeExtraArgs) + - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) + displayName: Make + - script: ctest --verbose --timeout 180 + workingDirectory: $(BuildType) + displayName: CTest From f37aff6ee273cb149f3d103598c1c38ab673268d Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 22:27:40 -0800 Subject: [PATCH 064/264] fix for macOS 14 and earlier --- include/mimalloc/internal.h | 2 +- src/heap.c | 6 +++--- src/init.c | 17 +++++++++++++---- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c189a082..8a61a58e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -186,7 +186,7 @@ size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats // "heap.c" -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld); void _mi_heap_destroy_pages(mi_heap_t* heap); void _mi_heap_collect_abandon(mi_heap_t* heap); void _mi_heap_set_default_direct(mi_heap_t* heap); diff --git a/src/heap.c b/src/heap.c index d2914361..ee0a8ce9 100644 --- a/src/heap.c +++ b/src/heap.c @@ -182,9 +182,9 @@ mi_heap_t* mi_heap_get_backing(void) { return bheap; } -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld) { _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = _mi_tld(); + heap->tld = (tld == NULL ? _mi_tld() : tld); // avoid reading the thread-local tld during initialization heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; @@ -216,7 +216,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? if (heap == NULL) return NULL; mi_assert(heap_tag >= 0 && heap_tag < 256); - _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */); + _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */, NULL); return heap; } diff --git a/src/init.c b/src/init.c index 2070405d..19e111d3 100644 --- a/src/init.c +++ b/src/init.c @@ -214,7 +214,7 @@ static void mi_heap_main_init(void) { if (_mi_heap_main.cookie == 0) { _mi_heap_main.thread_id = _mi_thread_id(); _mi_heap_main.cookie = 1; - #if defined(_WIN32) && !defined(MI_SHARED_LIB) + #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) _mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking #else _mi_random_init(&_mi_heap_main.random); @@ -344,8 +344,12 @@ static bool _mi_thread_heap_init(void) { //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { - // allocate heap and thread local data - mi_tld_t* tld = _mi_tld(); // allocates & initializes tld if needed + // allocates tld data + // note: we cannot access thread-locals yet as that can cause (recursive) allocation (on macOS <= 14 for + // example where the loader allocates thread-local data on demand). + mi_tld_t* tld = mi_tld_alloc(); + + // allocate and initialize the heap mi_memid_t memid; mi_heap_t* heap = (tld == NULL ? 
NULL : (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid)); if (heap==NULL || tld==NULL) { @@ -353,8 +357,13 @@ static bool _mi_thread_heap_init(void) { return false; } heap->memid = memid; - _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); + _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */, tld); + + // associate the heap with this thread + // (this is safe, on macOS for example, the heap is set in a dedicated TLS slot and thus does not cause recursive allocation) _mi_heap_set_default_direct(heap); + // now that the heap is set for this thread, we can set the thread-local tld. + mi_tld = tld; } return false; } From 7cd8f31f30cf77bf957b29af1d6f370b0b935759 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 10 Dec 2024 14:50:55 -0800 Subject: [PATCH 065/264] improve popcount --- include/mimalloc/bits.h | 48 +++++++++++++++++++++-------------------- src/libc.c | 4 ++-- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 4f0dce71..c0405d6f 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -144,11 +144,31 @@ typedef int32_t mi_ssize_t; #define mi_msc_builtinz(name) name##64 #endif - /* -------------------------------------------------------------------------------- - Count trailing/leading zero's + Popcount and count trailing/leading zero's -------------------------------------------------------------------------------- */ +size_t _mi_popcount_generic(size_t x); + +static inline size_t mi_popcount(size_t x) { + #if mi_has_builtinz(popcount) + return mi_builtinz(popcount)(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return mi_msc_builtinz(__popcnt)(x); + #elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_mm_popcnt_u64(x); + #else + #define MI_HAS_FAST_POPCOUNT 0 + return (x<=1 ? x : _mi_popcount_generic(x)); + #endif +} + +#ifndef MI_HAS_FAST_POPCOUNT +#define MI_HAS_FAST_POPCOUNT 1 +#endif + + + size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); @@ -169,6 +189,8 @@ static inline size_t mi_ctz(size_t x) { size_t r; __asm ("bsf\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; + #elif MI_HAS_FAST_POPCOUNT + return (x!=0 ? (mi_popcount(x^(x-1))-1) : MI_SIZE_BITS); #else #define MI_HAS_FAST_BITSCAN 0 return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS); @@ -179,7 +201,7 @@ static inline size_t mi_clz(size_t x) { #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 size_t r; __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); - return r; + return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -202,26 +224,6 @@ static inline size_t mi_clz(size_t x) { #define MI_HAS_FAST_BITSCAN 1 #endif -size_t _mi_popcount_generic(size_t x); - -static inline size_t mi_popcount(size_t x) { - #if mi_has_builtinz(popcount) - return mi_builtinz(popcount)(x); - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - return mi_msc_builtinz(__popcnt)(x); - #elif MI_ARCH_X64 && defined(__BMI1__) - return (size_t)_mm_popcnt_u64(x); - #else - #define MI_HAS_FAST_POPCOUNT 0 - return (x<=1 ? 
x : _mi_popcount_generic(x)); - #endif -} - -#ifndef MI_HAS_FAST_POPCOUNT -#define MI_HAS_FAST_POPCOUNT 1 -#endif - - /* -------------------------------------------------------------------------------- find trailing/leading zero (bit scan forward/reverse) -------------------------------------------------------------------------------- */ diff --git a/src/libc.c b/src/libc.c index 15d4d2a7..eed63d87 100644 --- a/src/libc.c +++ b/src/libc.c @@ -283,7 +283,7 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) { #if !MI_HAS_FAST_BITSCAN static size_t mi_ctz_generic32(uint32_t x) { - // de Bruijn multiplication, see + // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 @@ -293,7 +293,7 @@ static size_t mi_ctz_generic32(uint32_t x) { } static size_t mi_clz_generic32(uint32_t x) { - // de Bruijn multiplication, see + // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 From 13be5d6740f43931342ba0e59364a68e275da47a Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 10 Dec 2024 15:11:46 -0800 Subject: [PATCH 066/264] use non-null tld in heap_init --- src/heap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/heap.c b/src/heap.c index ee0a8ce9..1b5d14b4 100644 --- a/src/heap.c +++ b/src/heap.c @@ -184,7 +184,7 @@ mi_heap_t* mi_heap_get_backing(void) { void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld) { _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = (tld == NULL ? _mi_tld() : tld); // avoid reading the thread-local tld during initialization + heap->tld = tld; // avoid reading the thread-local tld during initialization heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; @@ -216,7 +216,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? if (heap == NULL) return NULL; mi_assert(heap_tag >= 0 && heap_tag < 256); - _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */, NULL); + _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? 
*/, (uint8_t)heap_tag /* heap tag */, bheap->tld); return heap; } From c478ddaab490a5161de2a297e126b2e561c010a2 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 10 Dec 2024 19:44:54 -0800 Subject: [PATCH 067/264] fix MI_GUARDED build --- ide/vs2022/mimalloc.vcxproj | 2 +- src/alloc.c | 5 ++++- src/arena.c | 22 +++++++++++++++++++--- src/init.c | 2 +- src/libc.c | 4 ++-- test/main-override-static.c | 4 ++-- test/test-stress.c | 2 +- 7 files changed, 30 insertions(+), 11 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index d8cc25b1..3f1280ee 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -116,7 +116,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); + MI_DEBUG=3;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/src/alloc.c b/src/alloc.c index b0c89e65..25d6f62e 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -628,6 +628,9 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { return NULL; } uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size; + // note: the alignment of the guard page relies on blocks being os_page_size aligned which + // is ensured in `mi_arena_page_alloc_fresh`. + mi_assert_internal(_mi_is_aligned(block, os_page_size)); mi_assert_internal(_mi_is_aligned(guard_page, os_page_size)); if (!page->memid.is_pinned && _mi_is_aligned(guard_page, os_page_size)) { _mi_os_protect(guard_page, os_page_size); @@ -662,7 +665,7 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size); mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(heap, req_size, zero, 0 /* huge_alignment */); if (block==NULL) return NULL; - void* const p = mi_block_ptr_set_guarded(block, obj_size); + void* const p = mi_block_ptr_set_guarded(block, obj_size); // stats mi_track_malloc(p, size, zero); diff --git a/src/arena.c b/src/arena.c index 24835f42..9923eae1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -285,7 +285,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = MI_ARENA_MIN_SIZE; + const size_t min_reserve = MI_ARENA_MIN_SIZE; const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; @@ -302,7 +302,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } // and try to reserve the arena - int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { // failed, try a smaller size? const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); @@ -624,7 +624,23 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) { _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small.\n"); }; - const size_t block_start = (os_align ? 
MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); + size_t block_start; + #if MI_GUARDED + // in a guarded build, we aling pages with blocks a multiple of an OS page size, to the OS page size + // this ensures that all blocks in such pages are OS page size aligned (which is needed for the guard pages) + const size_t os_page_size = _mi_os_page_size(); + mi_assert_internal(MI_PAGE_ALIGN >= os_page_size); + if (block_size % os_page_size == 0) { + block_start = _mi_align_up(MI_PAGE_INFO_SIZE, os_page_size); + } + else + #endif + if (os_align) { + block_start = MI_PAGE_ALIGN; + } + else { + block_start = MI_PAGE_INFO_SIZE; + } const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; diff --git a/src/init.c b/src/init.c index 19e111d3..57be59a8 100644 --- a/src/init.c +++ b/src/init.c @@ -180,7 +180,7 @@ mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t samp if (heap->guarded_sample_rate >= 1) { heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate; } - heap->guarded_sample_count = heap->guarded_sample_seed; // count down samples + heap->guarded_sample_count = 1 + heap->guarded_sample_seed; // count down samples } mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) { diff --git a/src/libc.c b/src/libc.c index eed63d87..0ec2164d 100644 --- a/src/libc.c +++ b/src/libc.c @@ -84,8 +84,8 @@ bool _mi_getenv(const char* name, char* result, size_t result_size) { // This is mostly to avoid calling these when libc is not yet // initialized (and to reduce dependencies) // -// format: d i, p x u, s -// prec: z l ll L +// format: d i, p, x, u, s +// type: z l ll L // width: 10 // align-left: - // fill: 0 diff --git a/test/main-override-static.c b/test/main-override-static.c index 2e7f1aca..410764bd 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -233,8 +233,8 @@ static void test_heap_walk(void) { } static void test_canary_leak(void) { - char* p = mi_mallocn_tp(char, 23); - for (int i = 0; i < 23; i++) { + char* p = mi_mallocn_tp(char, 22); + for (int i = 0; i < 22; i++) { p[i] = '0'+i; } puts(p); diff --git a/test/test-stress.c b/test/test-stress.c index 915c953f..0488fc2b 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -42,7 +42,7 @@ static int SCALE = 10; static int ITER = 10; #elif 0 static int THREADS = 4; -static int SCALE = 100; +static int SCALE = 10; static int ITER = 10; #define ALLOW_LARGE false #elif 0 From 64c4181ffa63e21b644f4f06d42279bfd4e82cf1 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 10 Dec 2024 20:32:48 -0800 Subject: [PATCH 068/264] better block alignment --- include/mimalloc/internal.h | 5 +++++ include/mimalloc/types.h | 7 ++++--- src/alloc-aligned.c | 4 +++- src/arena.c | 19 +++++++++++-------- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 8a61a58e..5c5afca0 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -479,11 +479,16 @@ static inline uint8_t* mi_page_start(const mi_page_t* page) { return page->page_start; } + static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { if (size) { *size = mi_page_block_size(page) * page->reserved; } return mi_page_start(page); } +static inline size_t mi_page_info_size(void) { + return _mi_align_up(sizeof(mi_page_t), MI_MAX_ALIGN_SIZE); +} + static inline bool 
mi_page_contains_address(const mi_page_t* page, const void* p) { size_t psize; uint8_t* start = mi_page_area(page, &psize); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 71edb397..dc1c93fe 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -323,13 +323,14 @@ typedef struct mi_page_s { // ------------------------------------------------------ #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. -#define MI_PAGE_MIN_BLOCK_ALIGN MI_SIZE_BITS // minimal block alignment in a page (64b on 64-bit, 32b on 32-bit) +#define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b) +#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation #if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 -#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) +#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*32) // 160 >= sizeof(mi_page_t) #else -#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) +#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*32) // 128/96 >= sizeof(mi_page_t) #endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 38e0371d..c36ce0af 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -20,7 +20,9 @@ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); if (alignment > size) return false; const size_t bsize = mi_good_size(size); - return (bsize <= MI_PAGE_MIN_BLOCK_ALIGN && (bsize & (alignment-1)) == 0); + const bool ok = (bsize <= MI_PAGE_MAX_START_BLOCK_ALIGN2 && _mi_is_power_of_two(bsize)); + if (ok) { mi_assert_internal((bsize & (alignment-1)) == 0); } // since both power of 2 and alignment <= size + return ok; } #if MI_GUARDED diff --git a/src/arena.c b/src/arena.c index 9923eae1..a05e1f5d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -621,25 +621,28 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz } } #endif - if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) { - _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small.\n"); - }; + mi_assert(MI_PAGE_INFO_SIZE >= mi_page_info_size()); size_t block_start; #if MI_GUARDED - // in a guarded build, we aling pages with blocks a multiple of an OS page size, to the OS page size + // in a guarded build, we align pages with blocks a multiple of an OS page size, to the OS page size // this ensures that all blocks in such pages are OS page size aligned (which is needed for the guard pages) const size_t os_page_size = _mi_os_page_size(); mi_assert_internal(MI_PAGE_ALIGN >= os_page_size); - if (block_size % os_page_size == 0) { - block_start = _mi_align_up(MI_PAGE_INFO_SIZE, os_page_size); + if (block_size % os_page_size == 0 && block_size > os_page_size /* at least 2 or more */ ) { + block_start = _mi_align_up(_mi_page_info_size(), os_page_size); } else #endif if (os_align) { block_start = MI_PAGE_ALIGN; } + else if (_mi_is_power_of_two(block_size) && block_size <= MI_PAGE_MAX_START_BLOCK_ALIGN2) { + // naturally align all power-of-2 blocks + 
block_start = _mi_align_up(mi_page_info_size(), block_size); + } else { - block_start = MI_PAGE_INFO_SIZE; + // otherwise start after the info + block_start = mi_page_info_size(); } const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); @@ -691,7 +694,7 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si const mi_arena_id_t req_arena_id = heap->arena_id; mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); - const size_t info_size = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); + const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); const size_t slice_count = mi_slice_count_of_size(info_size + block_size); mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld); From 24d3c1bc14b0286607f764d6dda8b1c55e2ad40d Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 09:16:28 -0800 Subject: [PATCH 069/264] heap meta data always uses mi_meta_zalloc --- include/mimalloc/internal.h | 1 + src/alloc-aligned.c | 2 ++ src/heap.c | 62 ++++++++++++++++++++++--------------- src/init.c | 16 +++------- 4 files changed, 45 insertions(+), 36 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 5c5afca0..a2e1d5d7 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -186,6 +186,7 @@ size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats // "heap.c" +mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld); void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld); void _mi_heap_destroy_pages(mi_heap_t* heap); void _mi_heap_collect_abandon(mi_heap_t* heap); diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index c36ce0af..14cbee45 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -183,6 +183,8 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t #endif // try first if there happens to be a small block available with just the right alignment + // since most small power-of-2 blocks (under MI_PAGE_MAX_BLOCK_START_ALIGN2) are already + // naturally aligned this can be often the case. if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) { const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` const size_t padsize = size + MI_PADDING_SIZE; diff --git a/src/heap.c b/src/heap.c index 1b5d14b4..837e7cd8 100644 --- a/src/heap.c +++ b/src/heap.c @@ -182,19 +182,25 @@ mi_heap_t* mi_heap_get_backing(void) { return bheap; } -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld) { +// todo: make order of parameters consistent (but would that break compat with CPython?) 
+void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) +{ + mi_assert_internal(heap!=NULL); + mi_memid_t memid = heap->memid; _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = tld; // avoid reading the thread-local tld during initialization + heap->memid = memid; + heap->tld = tld; // avoid reading the thread-local tld during initialization heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); - heap->tag = tag; + heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. // (but abandoning is good in this case) heap->allow_page_reclaim = false; } + if (heap->tld->heap_backing == NULL) { heap->tld->heap_backing = heap; // first heap becomes the backing heap _mi_random_init(&heap->random); @@ -206,18 +212,31 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->keys[0] = _mi_heap_random_next(heap); heap->keys[1] = _mi_heap_random_next(heap); _mi_heap_guarded_init(heap); + // push on the thread local heaps list heap->next = heap->tld->heaps; heap->tld->heaps = heap; } +mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld) { + mi_assert_internal(tld!=NULL); + mi_assert(heap_tag >= 0 && heap_tag < 256); + // allocate and initialize a heap + mi_memid_t memid; + mi_heap_t* heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); + if (heap==NULL) { + _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); + return NULL; + } + heap->memid = memid; + _mi_heap_init(heap, arena_id, allow_destroy, (uint8_t)heap_tag, tld); + return heap; +} + mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) { mi_heap_t* bheap = mi_heap_get_backing(); - mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? - if (heap == NULL) return NULL; - mi_assert(heap_tag >= 0 && heap_tag < 256); - _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */, bheap->tld); - return heap; + mi_assert_internal(bheap != NULL); + return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld); } mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { @@ -276,7 +295,7 @@ static void mi_heap_free(mi_heap_t* heap) { mi_assert_internal(heap->tld->heaps != NULL); // and free the used memory - mi_free(heap); + _mi_meta_free(heap, sizeof(*heap), heap->memid); } // return a heap on the same thread as `heap` specialized for the specified tag (if it exists) @@ -402,13 +421,7 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { mi_assert_internal(heap!=NULL); if (from==NULL || from->page_count == 0) return; - // reduce the size of the delayed frees - // _mi_heap_delayed_free_partial(from); - // transfer all pages by appending the queues; this will set a new heap field - // so threads may do delayed frees in either heap for a while. 
- // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state - // so after this only the new heap will get delayed frees for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_queue_t* append = &from->pages[i]; @@ -418,19 +431,17 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { } mi_assert_internal(from->page_count == 0); - // and do outstanding delayed frees in the `from` heap - // note: be careful here as the `heap` field in all those pages no longer point to `from`, - // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a - // the regular `_mi_free_delayed_block` which is safe. - //_mi_heap_delayed_free_all(from); - //#if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353 - // mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL); - //#endif - // and reset the `from` heap mi_heap_reset_pages(from); } +// are two heaps compatible with respect to heap-tag, exclusive arena etc. +static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) { + return (heap1->tag == heap2->tag && // store same kind of objects + heap1->tld->subproc == heap2->tld->subproc && // same sub-process + heap1->arena_id == heap2->arena_id); // same arena preference +} + // Safe delete a heap without freeing any still allocated blocks in that heap. void mi_heap_delete(mi_heap_t* heap) { @@ -439,7 +450,8 @@ void mi_heap_delete(mi_heap_t* heap) mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; - if (!mi_heap_is_backing(heap)) { + mi_heap_t* bheap = heap->tld->heap_backing; + if (heap != bheap && mi_heaps_are_compatible(bheap,heap)) { // transfer still used pages to the backing heap mi_heap_absorb(heap->tld->heap_backing, heap); } diff --git a/src/init.c b/src/init.c index 57be59a8..ae1ae086 100644 --- a/src/init.c +++ b/src/init.c @@ -345,25 +345,19 @@ static bool _mi_thread_heap_init(void) { } else { // allocates tld data - // note: we cannot access thread-locals yet as that can cause (recursive) allocation (on macOS <= 14 for - // example where the loader allocates thread-local data on demand). + // note: we cannot access thread-locals yet as that can cause (recursive) allocation + // (on macOS <= 14 for example where the loader allocates thread-local data on demand). mi_tld_t* tld = mi_tld_alloc(); // allocate and initialize the heap - mi_memid_t memid; - mi_heap_t* heap = (tld == NULL ? NULL : (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid)); - if (heap==NULL || tld==NULL) { - _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); - return false; - } - heap->memid = memid; - _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */, tld); + mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld); // associate the heap with this thread // (this is safe, on macOS for example, the heap is set in a dedicated TLS slot and thus does not cause recursive allocation) _mi_heap_set_default_direct(heap); + // now that the heap is set for this thread, we can set the thread-local tld. 
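// Condensed, the per-thread initialization order that this hunk establishes looks
// roughly as follows (a sketch; the signatures are the ones used in this patch series):
//   mi_tld_t*  tld  = mi_tld_alloc();                        // no thread-local access yet
//   mi_heap_t* heap = _mi_heap_create(0, false, _mi_arena_id_none(), tld);
//   _mi_heap_set_default_direct(heap);                       // dedicated TLS slot; safe on macOS
//   mi_tld = tld;                                            // ordinary thread-local written last
// writing `mi_tld` only after a default heap exists means a lazy thread-local
// allocation by the loader (macOS <= 14) cannot recurse back into this path.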
- mi_tld = tld; + mi_tld = tld; } return false; } From 565656919ed57b9530e9c23c64bb7b5e4b0a47b3 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 13:04:37 -0800 Subject: [PATCH 070/264] fix comments in types; fix guarded alignment bug --- ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc/internal.h | 9 +++- include/mimalloc/types.h | 95 +++++++++++++++---------------------- src/arena.c | 47 ++++++++---------- src/init.c | 5 +- 5 files changed, 67 insertions(+), 91 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 3f1280ee..34bb28fe 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -116,7 +116,7 @@ true Default ../../include - MI_DEBUG=3;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=1;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index a2e1d5d7..3c5bd486 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -581,7 +581,7 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) { return (page->free != NULL); } - + // is the page not yet used up to its reserved space? static inline bool mi_page_is_expandable(const mi_page_t* page) { mi_assert_internal(page != NULL); @@ -714,6 +714,12 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { Guarded objects ------------------------------------------------------------------- */ #if MI_GUARDED + +// we always align guarded pointers in a block at an offset +// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones +#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0)) +#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED) + static inline bool mi_block_ptr_is_guarded(const mi_block_t* block, const void* p) { const ptrdiff_t offset = (uint8_t*)p - (uint8_t*)block; return (offset >= (ptrdiff_t)(sizeof(mi_block_t)) && block->next == MI_BLOCK_TAG_GUARDED); @@ -895,6 +901,7 @@ static inline mi_memid_t _mi_memid_create_meta(void* mpage, size_t block_idx, si return memid; } + // ------------------------------------------------------------------- // Fast "random" shuffle // ------------------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index dc1c93fe..cc64a400 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -97,16 +97,8 @@ terms of the MIT license. A copy of the license can be found in the file #endif -// We used to abandon huge pages in order to eagerly deallocate it if freed from another thread. -// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a -// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from -// another thread so the memory becomes "virtually" available (and eventually gets properly freed by -// the owning thread). -// #define MI_HUGE_PAGE_ABANDON 1 - - // ------------------------------------------------------ -// Main internal data-structures +// Sizes of internal data-structures // ------------------------------------------------------ // Sizes are for 64-bit @@ -145,21 +137,32 @@ terms of the MIT license. 
A copy of the license can be found in the file // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX +// ------------------------------------------------------ +// Arena's are large reserved areas of memory allocated from +// the OS that are managed by mimalloc to efficiently +// allocate MI_ARENA_SLICE_SIZE slices of memory for the +// mimalloc pages. +// ------------------------------------------------------ + +// A large memory arena where pages are allocated in. +typedef struct mi_arena_s mi_arena_t; // defined in `arena.c` + // --------------------------------------------------------------- // a memory id tracks the provenance of arena/OS allocated memory // --------------------------------------------------------------- -// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. +// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated. +// The memid keeps track of this. typedef enum mi_memkind_e { MI_MEM_NONE, // not allocated MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) - MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) - MI_MEM_META, // allocated with the meta data allocator + MI_MEM_STATIC, // allocated in a static area and should not be freed (the initial main heap data for example (`init.c`)) + MI_MEM_META, // allocated with the meta data allocator (`arena-meta.c`) MI_MEM_OS, // allocated from the OS MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) - MI_MEM_ARENA // allocated from an arena (the usual case) + MI_MEM_ARENA // allocated from an arena (the usual case) (`arena.c`) } mi_memkind_t; static inline bool mi_memkind_is_os(mi_memkind_t memkind) { @@ -178,10 +181,9 @@ typedef struct mi_memid_os_info { } mi_memid_os_info_t; typedef struct mi_memid_arena_info { - uint32_t slice_index; // base index in the arena + mi_arena_t* arena; // arena that contains this memory + uint32_t slice_index; // slice index in the arena uint32_t slice_count; // allocated slices - mi_arena_id_t id; // arena id (>= 1) - bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; typedef struct mi_memid_meta_info { @@ -196,10 +198,10 @@ typedef struct mi_memid_s { mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA mi_memid_meta_info_t meta; // only used for MI_MEM_META } mem; + mi_memkind_t memkind; bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) bool initially_committed;// `true` if the memory was originally allocated as committed bool initially_zero; // `true` if the memory was originally zero initialized - mi_memkind_t memkind; } mi_memid_t; @@ -227,32 +229,21 @@ typedef struct mi_block_s { mi_encoded_t next; } mi_block_t; -#if MI_GUARDED -// we always align guarded pointers in a block at an offset -// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones -#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0)) -#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED) -#endif - - -// The owned flags are used for efficient multi-threaded free-ing -// When we push on the page thread free queue of an abandoned page, -// we also atomically get to own it. 
This is needed to atomically -// abandon a page (while other threads could concurrently free blocks in it). -typedef enum mi_owned_e { - MI_OWNED = 0, // some heap owns this page - MI_ABANDONED = 1, // the page is abandoned -} mi_owned_t; - // The `in_full` and `has_aligned` page flags are put in the same field // to efficiently test if both are false (`full_aligned == 0`) in the `mi_free` routine. +// `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing) +// `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing) #define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) #define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) typedef size_t mi_page_flags_t; // Thread free list. -// We use the bottom bit of the pointer for `mi_owned_t` flags +// Points to a list of blocks that are freed by other threads. +// The low-bit is set if the page is owned by the current thread. (`mi_page_is_owned`). +// Ownership is required before we can read any non-atomic fields in the page. +// This way we can push a block on the thread free list and try to claim ownership +// atomically in `free.c:mi_free_block_mt`. typedef uintptr_t mi_thread_free_t; // Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) @@ -276,19 +267,17 @@ typedef uint8_t mi_heaptag_t; // // We don't count `freed` (as |free|) but use `used` to reduce // the number of memory accesses in the `mi_page_all_free` function(s). -// +// // Notes: -// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` +// - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). +// - If a page is not part of a heap it is called "abandoned" -- in +// that case the `xthreadid` is 0 or 1 (1 is for abandoned pages that +// are in the abandoned page lists of an arena, these are called "mapped" abandoned pages). +// - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize -// concurrent frees where only the first concurrent free adds to the owning -// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). -// The invariant is that no-delayed-free is only set if there is -// at least one block that will be added, or as already been added, to -// the owning heap `thread_delayed_free` list. This guarantees that pages -// will be freed correctly even if only other threads free blocks. + typedef struct mi_page_s { - _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. 
(= heap->thread_id, or 0 or 1 if abandoned) mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) uint16_t used; // number of blocks in use (including blocks in `thread_free`) @@ -299,7 +288,7 @@ typedef struct mi_page_s { mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - _Atomic(mi_page_flags_t) xflags; // `in_full` and `has_aligned` flags + _Atomic(mi_page_flags_t) xflags; // `in_full_queue` and `has_aligned` flags size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks @@ -355,7 +344,7 @@ typedef enum mi_page_kind_e { MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages MI_PAGE_LARGE, // larger blocks go into 4MiB pages MI_PAGE_SINGLETON // page containing a single block. - // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. + // used for blocks `> MI_LARGE_MAX_OBJ_SIZE` or an aligment `> MI_PAGE_MAX_OVERALLOC_ALIGN`. } mi_page_kind_t; @@ -366,7 +355,7 @@ typedef enum mi_page_kind_e { // A heap just owns a set of pages for allocation and // can only be allocate/reallocate from the thread that created it. // Freeing blocks can be done from any thread though. -// Per thread, the segments are shared among its heaps. +// // Per thread, there is always a default heap that is // used for allocation; it is initialized to statically // point to an empty heap to avoid initialization checks @@ -436,16 +425,6 @@ struct mi_heap_s { mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; -// ------------------------------------------------------ -// Arena's -// These are large reserved areas of memory allocated from -// the OS that are managed by mimalloc to efficiently -// allocate MI_SLICE_SIZE slices of memory for the -// mimalloc pages. -// ------------------------------------------------------ - -// A large memory arena where pages are allocated in. -typedef struct mi_arena_s mi_arena_t; // ------------------------------------------------------ // Debug diff --git a/src/arena.c b/src/arena.c index a05e1f5d..c9d21c75 100644 --- a/src/arena.c +++ b/src/arena.c @@ -35,14 +35,13 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo // A memory arena descriptor typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area - mi_arena_id_t id; // arena id; 0 for non-specific - + mi_arena_id_t id; // arena id (> 0 where `arena == arenas[arena->id - 1]`) + size_t slice_count; // size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena + bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) - mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. mi_bitmap_t* slices_free; // is the slice free? 
@@ -93,7 +92,8 @@ static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclus bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { if (memid.memkind == MI_MEM_ARENA) { - return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + const mi_arena_t* arena = memid.mem.arena.arena; + return mi_arena_id_is_suitable(arena->id, arena->is_exclusive, request_arena_id); } else { return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); @@ -152,34 +152,25 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { // Create an arena memid -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t slice_index, size_t slice_count) { +static mi_memid_t mi_memid_create_arena(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(slice_index < UINT32_MAX); mi_assert_internal(slice_count < UINT32_MAX); mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); - memid.mem.arena.id = id; + memid.mem.arena.arena = arena; memid.mem.arena.slice_index = (uint32_t)slice_index; - memid.mem.arena.slice_count = (uint32_t)slice_count; - memid.mem.arena.is_exclusive = is_exclusive; + memid.mem.arena.slice_count = (uint32_t)slice_count; return memid; } -// returns if the arena is exclusive -static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* slice_index, size_t* slice_count) { +// get the arena and slice span +static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* slice_index, size_t* slice_count) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); - *arena_index = mi_arena_id_index(memid.mem.arena.id); + mi_arena_t* arena = memid.mem.arena.arena; if (slice_index) *slice_index = memid.mem.arena.slice_index; if (slice_count) *slice_count = memid.mem.arena.slice_count; - return memid.mem.arena.is_exclusive; + return arena; } -// get the arena and slice index -static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* slice_index, size_t* slice_count) { - size_t arena_index; - mi_arena_memid_indices(memid, &arena_index, slice_index, slice_count); - return mi_arena_from_index(arena_index); -} - - static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* slice_index, size_t* slice_count) { // todo: maybe store the arena* directly in the page? return mi_arena_from_memid(page->memid, slice_index, slice_count); @@ -198,7 +189,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // claimed it! 
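// A minimal sketch of what an arena memid carries after this change (field names
// as in the updated mi_memid_arena_info_t; illustrative only):
//   mi_arena_t* arena       = memid.mem.arena.arena;         // direct pointer, no arenas[] lookup
//   size_t      slice_index = memid.mem.arena.slice_index;   // first 64 KiB slice of the allocation
//   size_t      slice_count = memid.mem.arena.slice_count;   // number of slices spanned
//   void*       start       = mi_arena_slice_start(arena, slice_index);
// exclusivity is now read from the arena itself (`arena->is_exclusive`) rather than
// from a per-memid flag, which is what the following hunks adjust.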
void* p = mi_arena_slice_start(arena, slice_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); + *memid = mi_memid_create_arena(arena, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; // set the dirty bits @@ -323,7 +314,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_arena_id, int numa_node, bool allow_large) { if (!allow_large && arena->is_large) return false; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return false; + if (!mi_arena_id_is_suitable(arena->id, arena->is_exclusive, req_arena_id)) return false; if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); if (!numa_suitable) return false; @@ -628,8 +619,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz // this ensures that all blocks in such pages are OS page size aligned (which is needed for the guard pages) const size_t os_page_size = _mi_os_page_size(); mi_assert_internal(MI_PAGE_ALIGN >= os_page_size); - if (block_size % os_page_size == 0 && block_size > os_page_size /* at least 2 or more */ ) { - block_start = _mi_align_up(_mi_page_info_size(), os_page_size); + if (!os_align && block_size % os_page_size == 0 && block_size > os_page_size /* at least 2 or more */ ) { + block_start = _mi_align_up(mi_page_info_size(), os_page_size); } else #endif @@ -961,7 +952,7 @@ static void mi_arenas_unsafe_destroy(void) { for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { - mi_lock_done(&arena->abandoned_visit_lock); + // mi_lock_done(&arena->abandoned_visit_lock); if (mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); @@ -1085,13 +1076,13 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // init arena->id = _mi_arena_id_none(); arena->memid = memid; - arena->exclusive = exclusive; + arena->is_exclusive = exclusive; arena->slice_count = slice_count; arena->info_slices = info_slices; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; - mi_lock_init(&arena->abandoned_visit_lock); + // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps uint8_t* base = mi_arena_start(arena) + bitmap_base; diff --git a/src/init.c b/src/init.c index ae1ae086..a5a0819e 100644 --- a/src/init.c +++ b/src/init.c @@ -11,6 +11,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // memcpy, memset #include // atexit +#define MI_MEMID_STATIC {{{NULL,0}}, MI_MEM_STATIC, true /* pinned */, true /* committed */, false /* zero */ } // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { @@ -34,7 +35,7 @@ const mi_page_t _mi_page_empty = { NULL, // xheap NULL, NULL, // next, prev NULL, // subproc - { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid + MI_MEMID_STATIC // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -96,8 +97,6 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -#define MI_MEMID_STATIC {{{NULL,0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } - mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free From ab53a73cbdff604c0fa8b336030bd8d4e5f706a8 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 14:29:06 -0800 Subject: [PATCH 071/264] small updates --- include/mimalloc/atomic.h | 6 ++-- include/mimalloc/internal.h | 55 +++++++++++++++++----------- include/mimalloc/prim.h | 6 ++-- include/mimalloc/track.h | 6 ++-- include/mimalloc/types.h | 72 ++++++++++++++++--------------------- 5 files changed, 74 insertions(+), 71 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 3b0ff559..95c1aefd 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_ATOMIC_H -#define MIMALLOC_ATOMIC_H +#ifndef MI_ATOMIC_H +#define MI_ATOMIC_H // include windows.h or pthreads.h #if defined(_WIN32) @@ -509,4 +509,4 @@ static inline void mi_lock_done(mi_lock_t* lock) { -#endif // __MIMALLOC_ATOMIC_H +#endif // MI_ATOMIC_H diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3c5bd486..4b211d71 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_INTERNAL_H -#define MIMALLOC_INTERNAL_H +#ifndef MI_INTERNAL_H +#define MI_INTERNAL_H // -------------------------------------------------------------------------- @@ -239,27 +239,42 @@ bool _mi_page_is_valid(mi_page_t* page); #endif +// ------------------------------------------------------ +// Debug +// ------------------------------------------------------ + +#if !defined(MI_DEBUG_UNINIT) +#define MI_DEBUG_UNINIT (0xD0) +#endif +#if !defined(MI_DEBUG_FREED) +#define MI_DEBUG_FREED (0xDF) +#endif +#if !defined(MI_DEBUG_PADDING) +#define MI_DEBUG_PADDING (0xDE) +#endif + /* ----------------------------------------------------------- - Error codes passed to `_mi_fatal_error` - All are recoverable but EFAULT is a serious error and aborts by default in secure mode. 
- For portability define undefined error codes using common Unix codes: - + Assertions ----------------------------------------------------------- */ -#include -#ifndef EAGAIN // double free -#define EAGAIN (11) + +#if (MI_DEBUG) +// use our own assertion to print without memory allocation +void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func); +#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) +#else +#define mi_assert(x) #endif -#ifndef ENOMEM // out of memory -#define ENOMEM (12) + +#if (MI_DEBUG>1) +#define mi_assert_internal mi_assert +#else +#define mi_assert_internal(x) #endif -#ifndef EFAULT // corrupted free-list or meta-data -#define EFAULT (14) -#endif -#ifndef EINVAL // trying to free an invalid pointer -#define EINVAL (22) -#endif -#ifndef EOVERFLOW // count*size overflow -#define EOVERFLOW (75) + +#if (MI_DEBUG>2) +#define mi_assert_expensive mi_assert +#else +#define mi_assert_expensive(x) #endif @@ -1023,4 +1038,4 @@ static inline void _mi_memzero_aligned(void* dst, size_t n) { } -#endif +#endif // MI_INTERNAL_H diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 65f65376..99791585 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_PRIM_H -#define MIMALLOC_PRIM_H +#ifndef MI_PRIM_H +#define MI_PRIM_H // -------------------------------------------------------------------------- @@ -370,4 +370,4 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { #endif // mi_prim_get_default_heap() -#endif // MIMALLOC_PRIM_H +#endif // MI_PRIM_H diff --git a/include/mimalloc/track.h b/include/mimalloc/track.h index 4b5709e2..199308a6 100644 --- a/include/mimalloc/track.h +++ b/include/mimalloc/track.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_TRACK_H -#define MIMALLOC_TRACK_H +#ifndef MI_TRACK_H +#define MI_TRACK_H /* ------------------------------------------------------------------------------------------------------ Track memory ranges with macros for tools like Valgrind address sanitizer, or other memory checkers. @@ -142,4 +142,4 @@ defined, undefined, or not accessible at all: } #endif -#endif +#endif // MI_TRACK_H diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index cc64a400..03d522b5 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_TYPES_H -#define MIMALLOC_TYPES_H +#ifndef MI_TYPES_H +#define MI_TYPES_H // -------------------------------------------------------------------------- // This file contains the main type definitions for mimalloc: @@ -21,12 +21,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // ptrdiff_t #include // uintptr_t, uint16_t, etc -#include "bits.h" // bit ops, size defines -#include "atomic.h" // _Atomic - -#ifdef _MSC_VER -#pragma warning(disable:4214) // bitfield is not int -#endif +#include // error codes +#include "bits.h" // size defines (MI_INTPTR_SIZE etc), bit operations +#include "atomic.h" // _Atomic primitives // Minimal alignment necessary. On most platforms 16 bytes are needed // due to SSE registers for example. This must be at least `sizeof(void*)` @@ -351,6 +348,7 @@ typedef enum mi_page_kind_e { // ------------------------------------------------------ // Heaps +// // Provide first-class heaps to allocate from. // A heap just owns a set of pages for allocation and // can only be allocate/reallocate from the thread that created it. @@ -426,40 +424,6 @@ struct mi_heap_s { }; -// ------------------------------------------------------ -// Debug -// ------------------------------------------------------ - -#if !defined(MI_DEBUG_UNINIT) -#define MI_DEBUG_UNINIT (0xD0) -#endif -#if !defined(MI_DEBUG_FREED) -#define MI_DEBUG_FREED (0xDF) -#endif -#if !defined(MI_DEBUG_PADDING) -#define MI_DEBUG_PADDING (0xDE) -#endif - -#if (MI_DEBUG) -// use our own assertion to print without memory allocation -void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func ); -#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) -#else -#define mi_assert(x) -#endif - -#if (MI_DEBUG>1) -#define mi_assert_internal mi_assert -#else -#define mi_assert_internal(x) -#endif - -#if (MI_DEBUG>2) -#define mi_assert_expensive mi_assert -#else -#define mi_assert_expensive(x) -#endif - // ------------------------------------------------------ // Statistics // ------------------------------------------------------ @@ -575,4 +539,28 @@ struct mi_tld_s { mi_stats_t stats; // statistics }; +/* ----------------------------------------------------------- + Error codes passed to `_mi_fatal_error` + All are recoverable but EFAULT is a serious error and aborts by default in secure mode. 
+ For portability define undefined error codes using common Unix codes: + +----------------------------------------------------------- */ + +#ifndef EAGAIN // double free +#define EAGAIN (11) #endif +#ifndef ENOMEM // out of memory +#define ENOMEM (12) +#endif +#ifndef EFAULT // corrupted free-list or meta-data +#define EFAULT (14) +#endif +#ifndef EINVAL // trying to free an invalid pointer +#define EINVAL (22) +#endif +#ifndef EOVERFLOW // count*size overflow +#define EOVERFLOW (75) +#endif + + +#endif // MI_TYPES_H From 1c8d15abac1a9d269c335be7eaab4a8bc23aaf09 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 14:30:44 -0800 Subject: [PATCH 072/264] fix build error --- include/mimalloc/internal.h | 14 -------------- include/mimalloc/types.h | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 4b211d71..fb359763 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -239,20 +239,6 @@ bool _mi_page_is_valid(mi_page_t* page); #endif -// ------------------------------------------------------ -// Debug -// ------------------------------------------------------ - -#if !defined(MI_DEBUG_UNINIT) -#define MI_DEBUG_UNINIT (0xD0) -#endif -#if !defined(MI_DEBUG_FREED) -#define MI_DEBUG_FREED (0xDF) -#endif -#if !defined(MI_DEBUG_PADDING) -#define MI_DEBUG_PADDING (0xDE) -#endif - /* ----------------------------------------------------------- Assertions ----------------------------------------------------------- */ diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 03d522b5..77752398 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -562,5 +562,19 @@ struct mi_tld_s { #define EOVERFLOW (75) #endif +// ------------------------------------------------------ +// Debug +// ------------------------------------------------------ + +#ifndef MI_DEBUG_UNINIT +#define MI_DEBUG_UNINIT (0xD0) +#endif +#ifndef MI_DEBUG_FREED +#define MI_DEBUG_FREED (0xDF) +#endif +#ifndef MI_DEBUG_PADDING +#define MI_DEBUG_PADDING (0xDE) +#endif + #endif // MI_TYPES_H From ccf5e36e6bb273d0633b0756067d4808008b0b8a Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 16:26:39 -0800 Subject: [PATCH 073/264] use frac 8 for reclaim_on_free and reabandon; halve full_page_retain if running in a threadpool --- include/mimalloc/types.h | 1 + src/free.c | 4 ++-- src/heap.c | 5 +++++ src/init.c | 4 ++++ src/page.c | 2 +- 5 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 77752398..f4bfa07a 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -409,6 +409,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread mi_memid_t memid; // provenance of the heap struct itseft (meta or os) + long full_page_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types diff --git a/src/free.c b/src/free.c index 49bf8bf6..14034593 100644 --- a/src/free.c +++ b/src/free.c @@ -219,7 +219,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { // 2. 
if the page is not too full, we can try to reclaim it for ourselves // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && - !mi_page_is_used_at_frac(page,4) + !mi_page_is_used_at_frac(page,8) // && !mi_page_is_abandoned_mapped(page) ) { @@ -250,7 +250,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!mi_page_is_used_at_frac(page,4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!mi_page_is_used_at_frac(page,8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arena_page_try_reabandon_to_mapped(page)) { diff --git a/src/heap.c b/src/heap.c index 70162d46..1d8142f7 100644 --- a/src/heap.c +++ b/src/heap.c @@ -194,11 +194,16 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); + heap->full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. // (but abandoning is good in this case) heap->allow_page_reclaim = false; + // and halve the full page retain (possibly to 0) + if (heap->full_page_retain >= 0) { + heap->full_page_retain = heap->full_page_retain / 2; + } } if (heap->tld->heap_backing == NULL) { diff --git a/src/init.c b/src/init.c index a5a0819e..85588970 100644 --- a/src/init.c +++ b/src/init.c @@ -109,6 +109,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { MI_BIN_FULL, 0, // page retired min/max NULL, // next MI_MEMID_STATIC, // memid + 0, // full page retain false, // can reclaim true, // can eager abandon 0, // tag @@ -155,6 +156,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { MI_BIN_FULL, 0, // page retired min/max NULL, // next heap MI_MEMID_STATIC, // memid + 2, // full page retain true, // allow page reclaim true, // allow page abandon 0, // tag @@ -224,6 +226,8 @@ static void mi_heap_main_init(void) { mi_lock_init(&mi_subproc_default.abandoned_os_lock); mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock); _mi_heap_guarded_init(&_mi_heap_main); + _mi_heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); + _mi_heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); } } diff --git a/src/page.c b/src/page.c index 98319e53..a90c1d7d 100644 --- a/src/page.c +++ b/src/page.c @@ -642,7 +642,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m size_t count = 0; #endif long candidate_limit = 0; // we reset this on the first candidate to limit the search - long full_page_retain = _mi_option_get_fast(mi_option_full_page_retain); + long full_page_retain = heap->full_page_retain; mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; From aed76f29100302d3bbb9da7f2dfa75cd78f167e7 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 20:34:23 -0800 Subject: [PATCH 074/264] wip: allow arena (re)loading --- 
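[Editor's note, not part of the patch: a hedged usage sketch of the arena unload/reload API introduced below. It assumes an exclusive arena created over caller-managed memory with `mi_manage_os_memory_ex` (as `mi_arena_unload` only accepts exclusive, externally backed arenas), and that the hypothetical buffer `mem` is slice-aligned so the arena starts at `mem` itself.]

    // create an exclusive arena over memory we manage ourselves
    mi_arena_id_t id;
    mi_manage_os_memory_ex(mem, mem_size, /*is_committed*/ true, /*is_large*/ false,
                           /*is_zero*/ true, /*numa_node*/ -1, /*exclusive*/ true, &id);
    // ... allocate from heaps bound to this arena id ...

    // detach the arena: `base` is the arena itself, `accessed` the prefix that was touched
    void* base; size_t accessed, full;
    if (mi_arena_unload(id, &base, &accessed, &full)) {
      // persist or transfer [base, base+accessed), then later reload it at the same address
      mi_arena_id_t id2;
      mi_arena_reload(base, full, /*is_committed*/ true, /*is_large*/ false, /*is_zero*/ false, &id2);
    }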
ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc.h | 11 ++ include/mimalloc/internal.h | 1 - src/arena.c | 262 ++++++++++++++++++++---------------- src/bitmap.c | 35 +++++ src/bitmap.h | 6 + src/os.c | 47 ++++++- 7 files changed, 244 insertions(+), 120 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 34bb28fe..d8cc25b1 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -116,7 +116,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=1;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc.h b/include/mimalloc.h index c11353b7..97f74c83 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -317,6 +317,17 @@ mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t samp mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max); +// experimental +//mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size); +//mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size); +//mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size); +//mi_decl_export void mi_os_free(void* p, size_t size); +//mi_decl_export void mi_os_commit(void* p, size_t size); +//mi_decl_export void mi_os_decommit(void* p, size_t size); + +mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size); +mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id); + // ------------------------------------------------------ // Convenience // ------------------------------------------------------ diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index fb359763..3be08b94 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -143,7 +143,6 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t m // arena.c mi_arena_id_t _mi_arena_id_none(void); void _mi_arena_init(void); -void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid); void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); diff --git a/src/arena.c b/src/arena.c index c9d21c75..03f40932 100644 --- a/src/arena.c +++ b/src/arena.c @@ -176,6 +176,17 @@ static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* slice_index, size_t* s return mi_arena_from_memid(page->memid, slice_index, slice_count); } +static size_t mi_memid_size(mi_memid_t memid) { + if (memid.memkind == MI_MEM_ARENA) { + return memid.mem.arena.slice_count * MI_ARENA_SLICE_SIZE; + } + else if (mi_memid_is_os(memid) || memid.memkind == MI_MEM_EXTERNAL) { + return memid.mem.os.size; + } + else { + return 0; + } +} /* ----------------------------------------------------------- Arena Allocation @@ -727,7 +738,7 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block return page; } - +static void mi_arena_free(void* p, size_t size, mi_memid_t memid); 
void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); @@ -754,7 +765,7 @@ void _mi_arena_page_free(mi_page_t* page) { #endif _mi_page_map_unregister(page); - _mi_arena_free(page, 1, 1, page->memid); + mi_arena_free(page, mi_memid_size(page->memid), page->memid); } /* ----------------------------------------------------------- @@ -843,7 +854,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } else { - // page is full (or a singleton), page is OS/externally allocated + // page is full (or a singleton), page is OS/nly allocated // nothing to do // TODO: maintain count of these as well? } @@ -863,22 +874,16 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); static void mi_arenas_try_purge(bool force, bool visit_all); -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid) { - mi_assert_internal(size > 0); - mi_assert_internal(committed_size <= size); +static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { + mi_assert_internal(size >= 0); if (p==NULL) return; if (size==0) return; - const bool all_committed = (committed_size == size); - + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) mi_track_mem_undefined(p, size); if (mi_memkind_is_os(memid.memkind)) { // was a direct OS allocation, pass through - if (!all_committed && committed_size > 0) { - // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } _mi_os_free(p, size, memid); } else if (memid.memkind == MI_MEM_ARENA) { @@ -886,7 +891,8 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi size_t slice_count; size_t slice_index; mi_arena_t* arena = mi_arena_from_memid(memid, &slice_index, &slice_count); - mi_assert_internal(size==1); + mi_assert_internal((size%MI_ARENA_SLICE_SIZE)==0); + mi_assert_internal((slice_count*MI_ARENA_SLICE_SIZE)==size); mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= (uint8_t*)p); mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > (uint8_t*)p); // checks @@ -902,25 +908,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } // potentially decommit - if (arena->memid.is_pinned || arena->memid.initially_committed) { - mi_assert_internal(all_committed); - } - else { - /* - if (!all_committed) { - // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_clearN(&arena->slices_committed, slice_index, slice_count); - mi_track_mem_noaccess(p, size); - if (committed_size > 0) { - // if partially committed, adjust the committed stats (is it will be recommitted when re-using) - // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - // note: if not all committed, it may be that the purge will reset/decommit the entire range - // that contains already decommitted parts. Since purge consistently uses reset or decommit that - // works (as we should never reset decommitted parts). - } - */ + if (!arena->memid.is_pinned && !arena->memid.initially_committed) { // todo: maybe allow decommit even if initially committed? 
// (delay) purge the entire range mi_arena_schedule_purge(arena, slice_index, slice_count); } @@ -944,6 +932,29 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi mi_arenas_try_purge(false, false); } +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */); +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_arena_get_count(); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p) { + return true; + } + } + return false; +} + + + +/* ----------------------------------------------------------- + Remove an arena. +----------------------------------------------------------- */ + // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. static void mi_arenas_unsafe_destroy(void) { @@ -953,8 +964,8 @@ static void mi_arenas_unsafe_destroy(void) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { // mi_lock_done(&arena->abandoned_visit_lock); - if (mi_memkind_is_os(arena->memid.memkind)) { - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + if (mi_memkind_is_os(arena->memid.memkind)) { _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); } } @@ -965,10 +976,6 @@ static void mi_arenas_unsafe_destroy(void) { mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); } -// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */); -} // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. @@ -977,18 +984,6 @@ void _mi_arena_unsafe_destroy_all(void) { _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } -// Is a pointer inside any of our arenas? -bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_arena_get_count(); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { - return true; - } - } - return false; -} - /* ----------------------------------------------------------- Add an arena. 
@@ -999,7 +994,26 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* mi_assert_internal(arena->slice_count > 0); if (arena_id != NULL) { *arena_id = -1; } - size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + // first try to find a NULL entry + const size_t count = mi_arena_get_count(); + size_t i; + for (i = 0; i < count; i++) { + if (mi_arena_from_index(i) == NULL) { + arena->id = mi_arena_id_create(i); + mi_arena_t* expected = NULL; + if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &mi_arenas[i], &expected, arena)) { + // success + if (arena_id != NULL) { *arena_id = arena->id; } + return true; + } + else { + arena->id = _mi_arena_id_none(); + } + } + } + + // otherwise increase the max + i = mi_atomic_increment_acq_rel(&mi_arena_count); if (i >= MI_MAX_ARENAS) { mi_atomic_decrement_acq_rel(&mi_arena_count); return false; @@ -1076,7 +1090,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // init arena->id = _mi_arena_id_none(); arena->memid = memid; - arena->is_exclusive = exclusive; + arena->is_exclusive = exclusive; arena->slice_count = slice_count; arena->info_slices = info_slices; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) @@ -1116,6 +1130,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.mem.os.base = start; + memid.mem.os.size = size; memid.initially_committed = is_committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; @@ -1370,74 +1386,86 @@ static void mi_arenas_try_purge(bool force, bool visit_all) { } -/* ----------------------------------------------------------- - Special static area for mimalloc internal structures - to avoid OS calls (for example, for the subproc metadata (~= 721b)) ------------------------------------------------------------ */ - -#define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit - -static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 -static mi_decl_cache_align _Atomic(size_t)mi_arena_static_top; - -static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { - *memid = _mi_memid_none(); - if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; - const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); - if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; - - // try to claim space - if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } - const size_t oversize = size + alignment - 1; - if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; - const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); - size_t top = oldtop + oversize; - if (top > MI_ARENA_STATIC_MAX) { - // try to roll back, ok if this fails - mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); - return NULL; - } - - // success - *memid = _mi_memid_create(MI_MEM_STATIC); - memid->initially_zero = true; - const size_t start = _mi_align_up(oldtop, alignment); - uint8_t* const p = &mi_arena_static[start]; - _mi_memzero_aligned(p, size); - return p; -} - -void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { - *memid = _mi_memid_none(); - - // try static - void* p = 
mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); - if (p != NULL) return p; - - // or fall back to the OS - p = _mi_os_alloc(size, memid); - if (p == NULL) return NULL; - - // zero the OS memory if needed - if (!memid->initially_zero) { - _mi_memzero_aligned(p, size); - memid->initially_zero = true; - } - return p; -} - -void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { - if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid); - } - else { - mi_assert(memid.memkind == MI_MEM_STATIC); - } -} - bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); _mi_error_message(EINVAL, "implement mi_abandoned_visit_blocks\n"); return false; } + +/* ----------------------------------------------------------- + Unloading and reloading an arena. +----------------------------------------------------------- */ + +mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* full_size) { + const size_t count = mi_arena_get_count(); + const size_t arena_idx = mi_arena_id_index(arena_id); + if (count <= arena_idx) { + _mi_warning_message("arena id is invalid (%zu)\n", arena_id); + return false; + } + mi_arena_t* arena = mi_arena_from_id(arena_id); + if (arena==NULL) { + return false; + } + else if (!arena->is_exclusive) { + _mi_warning_message("cannot unload a non-exclusive arena (id %zu at %p)\n", arena_id, arena); + return false; + } + else if (arena->memid.memkind != MI_MEM_EXTERNAL) { + _mi_warning_message("can only unload managed arena's for external memory (id %zu at %p)\n", arena_id, arena); + return false; + } + if (base != NULL) { *base = (void*)arena; } + if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } + if (accessed_size != NULL) { + // scan the commit map for the highest entry + size_t idx; + if (mi_bitmap_bsr(arena->slices_committed, &idx)) { + *accessed_size = (idx + 1)* MI_ARENA_SLICE_SIZE; + } + else { + *accessed_size = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; + } + } + + // set the entry to NULL + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[arena_idx], NULL); + if (arena_idx + 1 == count) { // try adjust the count? 
+ size_t expected = count; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, count-1); + } + return true; +} + +bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id) { + // assume the memory area is already containing the arena + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + if (start == NULL || size == 0) return false; + mi_arena_t* arena = (mi_arena_t*)start; + mi_memid_t memid = arena->memid; + if (memid.memkind != MI_MEM_EXTERNAL) { + _mi_warning_message("can only reload arena's from external memory (%p)\n", arena); + return false; + } + if (memid.mem.os.base != start) { + _mi_warning_message("the reloaded arena base address differs from the external memory (arena: %p, external: %p)\n", arena, start); + return false; + } + if (memid.mem.os.size != size) { + _mi_warning_message("the reloaded arena size differs from the external memory (arena size: %zu, external size: %zu)\n", arena->memid.mem.os.size, size); + return false; + } + if (!arena->is_exclusive) { + _mi_warning_message("the reloaded arena is not exclusive\n"); + return false; + } + arena->memid.is_pinned = is_large; + arena->memid.initially_committed = is_committed; + arena->memid.initially_zero = is_zero; + arena->is_exclusive = true; + arena->is_large = is_large; + arena->id = _mi_arena_id_none(); + return mi_arena_add(arena, arena_id, &_mi_stats_main); +} + diff --git a/src/bitmap.c b/src/bitmap.c index 2f563066..d16a1b24 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -796,6 +796,20 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #endif } + +static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { + for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { + i--; + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + size_t idx; + if (mi_bsr(b, &idx)) { + *pidx = (i*MI_BFIELD_BITS) + idx; + return true; + } + } + return false; +} + /* -------------------------------------------------------------------------------- bitmap chunkmap -------------------------------------------------------------------------------- */ @@ -1154,6 +1168,27 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t return false; } + +bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx) { + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = chunkmap_max; i > 0; ) { + i--; + mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + if (mi_bsr(cmap,&cmap_idx)) { + // highest chunk + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + size_t cidx; + if (mi_bchunk_bsr(&bitmap->chunks[chunk_idx], &cidx)) { + *idx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + return true; + } + } + } + return false; +} + + // Clear a bit once it is set. void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); diff --git a/src/bitmap.h b/src/bitmap.h index 191b6864..71a016ee 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -206,4 +206,10 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t // allocated from `mi_arena_try_abandoned` (and is in the `claim` function of `mi_bitmap_try_find_and_claim`). void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); + +// If a bit is set in the bitmap, return `true` and set `idx` to its index. +// Otherwise return `false` (and `*idx` is undefined). 
+bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); + + #endif // MI_BITMAP_H diff --git a/src/os.c b/src/os.c index 55f7428e..9fcd5aed 100644 --- a/src/os.c +++ b/src/os.c @@ -290,7 +290,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); - + bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; @@ -671,3 +671,48 @@ int _mi_os_numa_node_get(void) { if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } return (int)numa_node; } + + +/* ---------------------------------------------------------------------------- + Public API +-----------------------------------------------------------------------------*/ + +mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size) { + return mi_os_alloc_aligned(size, mi_os_mem_config.alloc_granularity, commit, NULL, full_size); +} + +static void* mi_os_alloc_aligned_ex(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) { + mi_memid_t memid = _mi_memid_none(); + void* p = _mi_os_alloc_aligned(size, alignment, commit, allow_large, &memid); + if (p == NULL) return p; + if (is_committed != NULL) { *is_committed = memid.initially_committed; } + if (is_pinned != NULL) { *is_pinned = memid.is_pinned; } + if (base != NULL) { *base = memid.mem.os.base; } + if (full_size != NULL) { *full_size = memid.mem.os.size; } + if (!memid.initially_zero && memid.initially_committed) { + _mi_memzero_aligned(memid.mem.os.base, memid.mem.os.size); + } + return p; +} + +mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size) { + return mi_os_alloc_aligned_ex(size, alignment, commit, false, NULL, NULL, base, full_size); +} + +mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) { + return mi_os_alloc_aligned_ex(size, alignment, commit, true, is_committed, is_pinned, base, full_size); +} + +mi_decl_export void mi_os_free(void* p, size_t size) { + if (p==NULL || size == 0) return; + mi_memid_t memid = _mi_memid_create_os(p, size, true, false, false); + _mi_os_free(p, size, memid); +} + +mi_decl_export void mi_os_commit(void* p, size_t size) { + _mi_os_commit(p, size, NULL); +} + +mi_decl_export void mi_os_decommit(void* p, size_t size) { + _mi_os_decommit(p, size); +} From 94ce342ea9114c368e88eb96a485a1911d9ce5af Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 22:06:25 -0800 Subject: [PATCH 075/264] maintain pages set for arenas; improve arena load/unload --- include/mimalloc/internal.h | 9 +++-- src/arena.c | 68 +++++++++++++++++++++++++++++-------- src/bitmap.c | 23 ++++++++++++- src/bitmap.h | 5 +++ src/os.c | 3 +- src/page-map.c | 11 ++++-- 6 files changed, 97 insertions(+), 22 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3be08b94..ee7f1026 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -164,6 +164,7 @@ void _mi_meta_free(void* p, size_t size, mi_memid_t memid); bool _mi_page_map_init(void); void _mi_page_map_register(mi_page_t* page); void _mi_page_map_unregister(mi_page_t* page); +void _mi_page_map_unregister_range(void* start, size_t size); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t 
huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -437,14 +438,18 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si extern uint8_t* _mi_page_map; +static inline uintptr_t _mi_page_map_index(const void* p) { + return (((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT); +} + static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) { #if 1 - const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + const uintptr_t idx = _mi_page_map_index(p); const size_t ofs = _mi_page_map[idx]; if (valid != NULL) *valid = (ofs != 0); return (mi_page_t*)((idx - ofs + 1) << MI_ARENA_SLICE_SHIFT); #else - const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + const uintptr_t idx = _mi_page_map_index(p); const uintptr_t up = idx << MI_ARENA_SLICE_SHIFT; __builtin_prefetch((void*)up); const size_t ofs = _mi_page_map[idx]; diff --git a/src/arena.c b/src/arena.c index 03f40932..4f89a629 100644 --- a/src/arena.c +++ b/src/arena.c @@ -48,6 +48,7 @@ typedef struct mi_arena_s { mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t* slices_purge; // can the slice be purged? (slice in purge => slice in free) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? + mi_bitmap_t* pages; // all registered pages mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages // followed by the bitmaps (whose size depends on the arena size) @@ -117,7 +118,13 @@ static size_t mi_arena_info_slices(mi_arena_t* arena) { return arena->info_slices; } - +#if MI_DEBUG > 1 +static bool mi_arena_has_page(mi_arena_t* arena, mi_page_t* page) { + return (page->memid.memkind == MI_MEM_ARENA && + page->memid.mem.arena.arena == arena && + mi_bitmap_is_setN(arena->pages, page->memid.mem.arena.slice_index, 1)); +} +#endif /* ----------------------------------------------------------- Util @@ -551,10 +558,11 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_arena_has_page(arena,page)); mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); - + _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); @@ -588,6 +596,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, tld->tseq, &memid); + if (page != NULL) { + mi_assert_internal(mi_bitmap_is_clearN(memid.mem.arena.arena->pages, memid.mem.arena.slice_index, memid.mem.arena.slice_count)); + mi_bitmap_set(memid.mem.arena.arena->pages, memid.mem.arena.slice_index); + } } // otherwise fall back to the OS @@ -758,6 +770,7 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); 
mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); + mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may // be (temporarily) not true if the free happens while trying to reclaim // see `mi_arana_try_claim_abandoned` @@ -765,6 +778,9 @@ void _mi_arena_page_free(mi_page_t* page) { #endif _mi_page_map_unregister(page); + if (page->memid.memkind == MI_MEM_ARENA) { + mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); + } mi_arena_free(page, mi_memid_size(page->memid), page->memid); } @@ -1104,6 +1120,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); arena->slices_purge = mi_arena_bitmap_init(slice_count,&base); + arena->pages = mi_arena_bitmap_init(slice_count, &base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { arena->pages_abandoned[i] = mi_arena_bitmap_init(slice_count,&base); } @@ -1396,6 +1413,18 @@ bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool vi /* ----------------------------------------------------------- Unloading and reloading an arena. ----------------------------------------------------------- */ +static bool mi_arena_page_register(size_t slice_index, mi_arena_t* arena, void* arg) { + MI_UNUSED(arg); + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); + _mi_page_map_register(page); + mi_assert_internal(_mi_ptr_page(page)==page); + return true; +} + +static bool mi_arena_pages_reregister(mi_arena_t* arena) { + return _mi_bitmap_forall_set(arena->pages, &mi_arena_page_register, arena, NULL); +} mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* full_size) { const size_t count = mi_arena_get_count(); @@ -1416,18 +1445,23 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* _mi_warning_message("can only unload managed arena's for external memory (id %zu at %p)\n", arena_id, arena); return false; } - if (base != NULL) { *base = (void*)arena; } - if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } - if (accessed_size != NULL) { - // scan the commit map for the highest entry - size_t idx; - if (mi_bitmap_bsr(arena->slices_committed, &idx)) { - *accessed_size = (idx + 1)* MI_ARENA_SLICE_SIZE; - } - else { - *accessed_size = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; - } + + // find accessed size + size_t asize; + // scan the commit map for the highest entry + size_t idx; + if (mi_bitmap_bsr(arena->slices_committed, &idx)) { + asize = (idx + 1)* MI_ARENA_SLICE_SIZE; } + else { + asize = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; + } + if (base != NULL) { *base = (void*)arena; } + if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } + if (accessed_size != NULL) { *accessed_size = asize; } + + // unregister the pages + _mi_page_map_unregister_range(arena, asize); // set the entry to NULL mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[arena_idx], NULL); @@ -1438,7 +1472,7 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* return true; } -bool mi_arena_reload(void* start, size_t size, bool 
is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id) { +mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id) { // assume the memory area is already containing the arena if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } if (start == NULL || size == 0) return false; @@ -1466,6 +1500,10 @@ bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, arena->is_exclusive = true; arena->is_large = is_large; arena->id = _mi_arena_id_none(); - return mi_arena_add(arena, arena_id, &_mi_stats_main); + if (!mi_arena_add(arena, arena_id, &_mi_stats_main)) { + return false; + } + mi_arena_pages_reregister(arena); + return true; } diff --git a/src/bitmap.c b/src/bitmap.c index d16a1b24..f1b1a759 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1051,7 +1051,6 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n #define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ - MI_UNUSED(tseq); \ const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \ const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? */ \ const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \ @@ -1197,3 +1196,25 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx); } + + +// Visit all set bits in a bitmap. +// todo: optimize further? maybe popcount to help the branch predictor for the loop, +// and keep b constant (using a mask)? or avx512 to directly get all indices using a mask_compressstore? 
+bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { + mi_bitmap_forall_chunks(bitmap, 0, chunk_idx) { + mi_bchunk_t* chunk = &bitmap->chunks[chunk_idx]; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + size_t bidx; + while (mi_bsf(b, &bidx)) { + b = b & (b-1); // clear low bit + const size_t idx = base_idx + bidx; + if (!visit(idx, arena, arg)) return false; + } + } + } + mi_bitmap_forall_chunks_end(); + return true; +} diff --git a/src/bitmap.h b/src/bitmap.h index 71a016ee..7fd09f43 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -212,4 +212,9 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); +typedef bool (mi_forall_set_fun_t)(size_t slice_index, mi_arena_t* arena, void* arg2); + +// Visit all set bits in a bitmap +bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); + #endif // MI_BITMAP_H diff --git a/src/os.c b/src/os.c index 9fcd5aed..86ecb16b 100644 --- a/src/os.c +++ b/src/os.c @@ -676,7 +676,7 @@ int _mi_os_numa_node_get(void) { /* ---------------------------------------------------------------------------- Public API -----------------------------------------------------------------------------*/ - +#if 0 mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size) { return mi_os_alloc_aligned(size, mi_os_mem_config.alloc_granularity, commit, NULL, full_size); } @@ -716,3 +716,4 @@ mi_decl_export void mi_os_commit(void* p, size_t size) { mi_decl_export void mi_os_decommit(void* p, size_t size) { _mi_os_decommit(p, size); } +#endif diff --git a/src/page-map.c b/src/page-map.c index 181db7f0..7b74c711 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -15,6 +15,7 @@ static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static void* mi_page_map_max_address = NULL; static mi_memid_t mi_page_map_memid; + // (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; @@ -84,7 +85,7 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* *page_start = mi_page_area(page, &page_size); if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks - return ((uintptr_t)page >> MI_ARENA_SLICE_SHIFT); + return _mi_page_map_index(page); } @@ -113,16 +114,20 @@ void _mi_page_map_register(mi_page_t* page) { void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); - // get index and count uint8_t* page_start; size_t slice_count; const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count); - // unset the offsets _mi_memzero(_mi_page_map + idx, slice_count); } +void _mi_page_map_unregister_range(void* start, size_t size) { + const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); + const uintptr_t index = _mi_page_map_index(start); + mi_page_map_ensure_committed(index, slice_count); // we commit the range in total; todo: 
scan the commit bits and clear only those ranges? + _mi_memzero(&_mi_page_map[index], slice_count); +} mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { // if mi_unlikely(_mi_page_map==NULL) { // happens on macOS during loading From 118bd8c97f9b7d189d89b671b49a66f58ded2ad6 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 16:37:31 -0800 Subject: [PATCH 076/264] space out threads when searching for free pages --- CMakeLists.txt | 13 ++- src/arena.c | 19 ++-- src/bitmap.c | 236 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 183 insertions(+), 85 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e96ff089..fa35d749 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,7 @@ option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a smal option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON) +option(MI_OPT_SIMD "Use SIMD instructions (requires MI_OPT_ARCH to be enabled)" OFF) option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) @@ -227,7 +228,7 @@ endif() if(MI_SEE_ASM) message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)") list(APPEND mi_cflags -save-temps) - if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") + if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 14) message(STATUS "No GNU Line marker") list(APPEND mi_cflags -Wno-gnu-line-marker) endif() @@ -330,10 +331,10 @@ endif() # Determine architecture set(MI_OPT_ARCH_FLAGS "") set(MI_ARCH "") -if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64") # msvc set(MI_ARCH "x64") -elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR +elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR # apple CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") # msvc set(MI_ARCH "arm64") @@ -419,6 +420,12 @@ endif() if(MI_OPT_ARCH_FLAGS) list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS}) message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)") + if (MI_OPT_SIMD) + list(APPEND mi_defines "MI_OPT_SIMD=1") + message(STATUS "SIMD instructions are enabled (MI_OPT_SIMD=ON)") + endif() +elseif(MI_OPT_SIMD) + message(STATUS "SIMD instructions are not enabled (either MI_OPT_ARCH=OFF or this architecture has no SIMD support)") endif() # extra needed libraries diff --git a/src/arena.c b/src/arena.c index 4f89a629..32c0b32e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -36,7 +36,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area mi_arena_id_t id; // arena id (> 0 where `arena == arenas[arena->id - 1]`) - + size_t slice_count; // size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node @@ -165,7 +165,7 @@ static mi_memid_t mi_memid_create_arena(mi_arena_t* arena, size_t slice_index, s 
mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); memid.mem.arena.arena = arena; memid.mem.arena.slice_index = (uint32_t)slice_index; - memid.mem.arena.slice_count = (uint32_t)slice_count; + memid.mem.arena.slice_count = (uint32_t)slice_count; return memid; } @@ -562,7 +562,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); - + _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); @@ -770,7 +770,7 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); - mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); + mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may // be (temporarily) not true if the free happens while trying to reclaim // see `mi_arana_try_claim_abandoned` @@ -891,10 +891,9 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all); static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { - mi_assert_internal(size >= 0); if (p==NULL) return; if (size==0) return; - + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) mi_track_mem_undefined(p, size); @@ -981,7 +980,7 @@ static void mi_arenas_unsafe_destroy(void) { if (arena != NULL) { // mi_lock_done(&arena->abandoned_visit_lock); mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - if (mi_memkind_is_os(arena->memid.memkind)) { + if (mi_memkind_is_os(arena->memid.memkind)) { _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); } } @@ -1457,12 +1456,12 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* asize = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; } if (base != NULL) { *base = (void*)arena; } - if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } + if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } if (accessed_size != NULL) { *accessed_size = asize; } - // unregister the pages + // unregister the pages _mi_page_map_unregister_range(arena, asize); - + // set the entry to NULL mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[arena_idx], NULL); if (arena_idx + 1 == count) { // try adjust the count? 
diff --git a/src/bitmap.c b/src/bitmap.c index f1b1a759..4f21f68f 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -14,7 +14,9 @@ Concurrent bitmap that can set/reset sequences of bits atomically #include "mimalloc/bits.h" #include "bitmap.h" -#define MI_USE_SIMD 0 +#ifndef MI_OPT_SIMD +#define MI_OPT_SIMD 0 +#endif /* -------------------------------------------------------------------------------- bfields @@ -24,11 +26,15 @@ static inline size_t mi_bfield_ctz(mi_bfield_t x) { return mi_ctz(x); } - static inline size_t mi_bfield_popcount(mi_bfield_t x) { return mi_popcount(x); } +static inline mi_bfield_t mi_bfield_clear_least_bit(mi_bfield_t x) { + return (x & (x-1)); +} + + // find the least significant bit that is set (i.e. count trailing zero's) // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -156,16 +162,6 @@ static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b) { // ------- mi_bfield_atomic_try_xset --------------------------------------- -// Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0. -// `all_clear` is set to true if the new bfield is zero (and false otherwise) -static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { - mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = mi_bfield_one()<bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) @@ -502,10 +509,10 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; // try again } - #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; #if 0 @@ -534,9 +541,9 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
- chunk_idx = _tzcnt_u64(mask) / 8; + chunk_idx = mi_ctz(mask) / 8; #endif - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; // try again } #else @@ -551,12 +558,17 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx #endif } +static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + mi_assert_internal(n==1); MI_UNUSED(n); + return mi_bchunk_try_find_and_clear(chunk, pidx); +} +#if !MI_OPT_SIMD static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); if (!allow_all_set && (~b == 0)) return false; // has_set8 has low bit in each byte set if the byte in x == 0xFF - const mi_bfield_t has_set8 = + const mi_bfield_t has_set8 = ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F (b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 >> 7; // shift high bit to low bit @@ -573,13 +585,14 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c } return false; } +#endif // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find medium size pages in the free blocks. // todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { - #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -615,6 +628,10 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s #endif } +static inline bool mi_bchunk_try_find_and_clear_8(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + mi_assert_internal(n==8); MI_UNUSED(n); + return mi_bchunk_try_find_and_clear8(chunk, pidx); +} // find least bfield in a chunk with all bits set, and try unset it atomically @@ -622,7 +639,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // Used to find large size pages in the free blocks. // todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { - #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -658,6 +675,10 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, #endif } +static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + mi_assert_internal(n==MI_BFIELD_BITS); MI_UNUSED(n); + return mi_bchunk_try_find_and_clearX(chunk, pidx); +} // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, // and try to clear them atomically. @@ -783,10 +804,10 @@ static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { // are all bits in a bitmap chunk clear? 
static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { - #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return mi_mm256_is_zero(vec); - #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); @@ -835,7 +856,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_clear(&bitmap->chunkmap, chunk_idx, NULL); // .. but a concurrent set may have happened in between our all-clear test and the clearing of the // bit in the mask. We check again to catch this situation. - if (!mi_bchunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) { mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } @@ -1043,11 +1064,129 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n +/* -------------------------------------------------------------------------------- + Iterate through a bfield +-------------------------------------------------------------------------------- */ + +// Cycle iteration through a bitfield. This is used to space out threads +// so there is less chance of contention. When searching for a free page we +// like to first search only the accessed part (so we reuse better). This +// high point is called the `cycle`. +// +// We then iterate through the bitfield as: +// first: [start, cycle> +// then : [0, start> +// then : [cycle, MI_BFIELD_BITS> +// +// The start is determined usually as `tseq % cycle` to have each thread +// start at a different spot. +// - We use `popcount` to improve branch prediction` +// - The `cycle_mask` is the part `[start, cycle>`. 
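// [Editor's illustration, not part of the patch] A concrete example of the visiting
// order described above: take MI_BFIELD_BITS==8, cycle==6 and tseq==2 (so start==2).
// For a bfield with all 8 bits set, the iteration visits
//   bits 2,3,4,5  -- the [start,cycle) part first,
//   then 0,1      -- the [0,start) part,
//   then 6,7      -- the [cycle,MI_BFIELD_BITS) tail,
// i.e. each thread starts probing the already-accessed range at its own offset,
// which spaces threads out and reduces contention on the same chunks.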
+#define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \ + mi_assert_internal(start <= cycle); \ + mi_assert_internal(start < MI_BFIELD_BITS); \ + mi_assert_internal(cycle < MI_BFIELD_BITS); \ + mi_bfield_t _cycle_mask##SUF = mi_bfield_mask(cycle - start, start); \ + size_t _bcount##SUF = mi_bfield_popcount(bfield); \ + mi_bfield_t _b##SUF = bfield & _cycle_mask##SUF; /* process [start, cycle> first*/\ + while(_bcount##SUF > 0) { \ + _bcount##SUF--;\ + if (_b##SUF==0) { _b##SUF = bfield & ~_cycle_mask##SUF; } /* process [0,start> + [cycle, MI_BFIELD_BITS> next */ \ + size_t name_idx; \ + bool _found##SUF = mi_bfield_find_least_bit(_b##SUF,&name_idx); \ + mi_assert_internal(_found##SUF); MI_UNUSED(_found##SUF); \ + { \ + +#define mi_bfield_iterate_end(SUF) \ + } \ + _b##SUF = mi_bfield_clear_least_bit(_b##SUF); \ + } \ +} + +#define mi_bfield_cycle_iterate(bfield,tseq,cycle,name_idx,SUF) { \ + const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); \ + mi_bfield_iterate(bfield,_start##SUF,cycle,name_idx,SUF) + +#define mi_bfield_cycle_iterate_end(SUF) \ + mi_bfield_iterate_end(SUF); } + + /* -------------------------------------------------------------------------------- bitmap try_find_and_clear (used to find free pages) -------------------------------------------------------------------------------- */ + +typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); + +static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) +{ + // we space out threads to reduce contention + const size_t cmap_max_count = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); + const size_t chunk_acc = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); + const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS; + const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS); + + // create a mask over the chunkmap entries to iterate over them efficiently + mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); + const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); + const size_t cmap_cycle = cmap_acc+1; + mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) + { + // and for each chunkmap entry we iterate over its bits to find the chunks + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[cmap_idx]); + size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); + mi_bfield_cycle_iterate(cmap_entry, tseq, cmap_entry_cycle, eidx, Y) + { + mi_assert_internal(eidx <= MI_BFIELD_BITS); + const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + size_t cidx; + // if we find a spot in the chunk we are done + if ((*try_find_and_clear)(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); + return true; + } + else { + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); + } + } + mi_bfield_cycle_iterate_end(Y); + } + mi_bfield_cycle_iterate_end(X); + return false; +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BCHUNK_BITS); + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); +} + + +/* -------------------------------------------------------------------------------- + bitmap try_find_and_claim + (used to allocate abandoned pages) +-------------------------------------------------------------------------------- */ + #define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ @@ -1084,53 +1223,6 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } \ }} - -#define mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NSUF, NPAR) { \ - mi_bitmap_forall_chunks(bitmap, tseq, _chunk_idx) { \ - size_t _cidx; \ - if mi_likely(mi_bchunk_try_find_and_clear##NSUF(&bitmap->chunks[_chunk_idx] NPAR, &_cidx)) { \ - *pidx = (_chunk_idx * MI_BCHUNK_BITS) + _cidx; \ - return true; \ - } \ - else { \ - /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ \ - mi_bitmap_chunkmap_try_clear(bitmap, _chunk_idx); \ - } \ - } \ - mi_bitmap_forall_chunks_end(); \ - return false; \ -} - -#define COMMA , - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, , ); -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, 8, ); -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, X, ); -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { - mi_assert_internal(n<=MI_BFIELD_BITS); - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NX, COMMA n); -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { - mi_assert_internal(n<=MI_BCHUNK_BITS); - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, N_, COMMA n); -} - - -/* -------------------------------------------------------------------------------- - bitmap try_find_and_claim - (used to allocate abandoned pages) --------------------------------------------------------------------------------- */ - // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, @@ -1177,7 +1269,7 @@ bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx) { if (mi_bsr(cmap,&cmap_idx)) { // highest chunk const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; - size_t cidx; + size_t cidx; if (mi_bchunk_bsr(&bitmap->chunks[chunk_idx], &cidx)) { *idx = (chunk_idx * MI_BCHUNK_BITS) + cidx; return true; From 98879ac8bcf545f03ec750c1172a60577f67aa63 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 17:22:00 -0800 Subject: [PATCH 077/264] use thread spacing for reclaim as well --- src/bitmap.c | 197 +++++++++++++++++++++++++++------------------------ 1 file changed, 104 insertions(+), 93 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 4f21f68f..0588858d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -42,6 +42,13 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } +// find each set bit in a bit field `x` until it becomes zero. +static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { + const bool found = mi_bfield_find_least_bit(*x, idx); + *x = mi_bfield_clear_least_bit(*x); + return found; +} + //static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { // return mi_rotr(x,r); //} @@ -1080,7 +1087,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // // The start is determined usually as `tseq % cycle` to have each thread // start at a different spot. -// - We use `popcount` to improve branch prediction` +// - We use `popcount` to improve branch prediction (maybe not needed? can we simplify?) // - The `cycle_mask` is the part `[start, cycle>`. 
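 // - Example (illustrative, assuming MI_BFIELD_BITS==64): with start==5 and cycle==12, the set bits
 //   in positions 5..11 are visited first (in ascending order), followed by the remaining set bits
 //   in positions 0..4 and 12..63.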
#define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \ mi_assert_internal(start <= cycle); \ @@ -1112,14 +1119,15 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* -------------------------------------------------------------------------------- - bitmap try_find_and_clear + mi_bitmap_find (used to find free pages) -------------------------------------------------------------------------------- */ +typedef bool (mi_bitmap_visit_fun_t)(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* idx, void* arg1, void* arg2); -typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); - -static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) +// Go through the bitmap and for every sequence of `n` set bits, call the visitor function. +// If it returns `true` stop the search. +static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bitmap_visit_fun_t* on_find, void* arg1, void* arg2) { // we space out threads to reduce contention const size_t cmap_max_count = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); @@ -1141,17 +1149,9 @@ static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, siz mi_assert_internal(eidx <= MI_BFIELD_BITS); const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - size_t cidx; - // if we find a spot in the chunk we are done - if ((*try_find_and_clear)(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); + if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) { return true; } - else { - /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - } } mi_bfield_cycle_iterate_end(Y); } @@ -1159,6 +1159,36 @@ static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, siz return false; } + +/* -------------------------------------------------------------------------------- + mi_bitmap_try_find_and_clear -- used to find free pages + note: the compiler will fully inline the indirect function calls +-------------------------------------------------------------------------------- */ + + +typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); + +static bool mi_bitmap_try_find_and_clear_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) { + MI_UNUSED(arg2); + mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear = (mi_bchunk_try_find_and_clear_fun_t*)arg1; + size_t cidx; + // if we find a spot in the chunk we are done + if ((*try_find_and_clear)(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); + return true; + } + else { + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); + return false; + } +} + +static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) { + return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); +} + mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); } @@ -1183,80 +1213,55 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_ /* -------------------------------------------------------------------------------- - bitmap try_find_and_claim - (used to allocate abandoned pages) + Bitmap: try_find_and_claim -- used to allocate abandoned pages + note: the compiler will fully inline the indirect function call -------------------------------------------------------------------------------- */ -#define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ - { \ - /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ - const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \ - const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? */ \ - const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \ - const size_t chunkmap_max_acc = _mi_divide_up(chunk_max_acc,MI_BFIELD_BITS); \ - const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ - /* for each chunkmap entry `i` */ \ - for (size_t _i = 0; _i < chunkmap_max; _i++) { \ - size_t i; \ - if (_i < chunkmap_max_acc) { /* first the chunks up to chunk_max_accessed */ \ - i = _i + chunkmap_start; \ - if (i >= chunkmap_max_acc) { i -= chunkmap_max_acc; } /* rotate */ \ - } \ - else { i = _i; } /* the rest of the chunks above chunk_max_accessed */ \ - const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ - mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ - /* todo: space out threads within a chunkmap (2GiB) as well? */ \ - size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - size_t cmap_idx; \ - while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ - /* set the chunk idx */ \ - size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ - /* try to find and clear N bits in that chunk */ \ - { +typedef struct mi_claim_fun_data_s { + mi_arena_t* arena; + mi_subproc_t* subproc; + int heap_tag; +} mi_claim_fun_data_t; -#define mi_bitmap_forall_chunks_end() \ - } \ - /* skip to the next bit */ \ - cmap_idx_shift += cmap_idx+1; \ - cmap >>= cmap_idx; /* skip scanned bits (and avoid UB for `cmap_idx+1`) */ \ - cmap >>= 1; \ - } \ - }} +static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) +{ + mi_assert_internal(n==1); MI_UNUSED(n); + mi_claim_fun_t* claim_fun = (mi_claim_fun_t*)arg1; + mi_claim_fun_data_t* claim_data = (mi_claim_fun_data_t*)arg2; + size_t cidx; + if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) { + const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); + bool keep_set = true; + if ((*claim_fun)(slice_index, claim_data->arena, claim_data->subproc, claim_data->heap_tag, &keep_set)) { + // success! 
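+        // (a successful claim means the callback took ownership of the page and left
+        //  keep_set==false, so the abandoned bit stays cleared)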
+ mi_assert_internal(!keep_set); + *pidx = slice_index; + return true; + } + else { + // failed to claim it, set abandoned mapping again (unless the page was freed) + if (keep_set) { + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + mi_assert_internal(wasclear); MI_UNUSED(wasclear); + } + } + } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); + } + return false; +} // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ) + mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) { - mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) - { - size_t cidx; - if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) { - const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); - bool keep_set = true; - if ((*claim)(slice_index, arena, subproc, heap_tag, &keep_set)) { - // success! - mi_assert_internal(!keep_set); - *pidx = slice_index; - return true; - } - else { - // failed to claim it, set abandoned mapping again (unless the page was freed) - if (keep_set) { - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); - mi_assert_internal(wasclear); MI_UNUSED(wasclear); - } - } - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - } - } - mi_bitmap_forall_chunks_end(); - return false; + mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; + return mi_bitmap_find(bitmap, tseq, 1, pidx, &mi_bitmap_try_find_and_claim_visit, (void*)claim, &claim_data); } @@ -1291,22 +1296,28 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { // Visit all set bits in a bitmap. -// todo: optimize further? maybe popcount to help the branch predictor for the loop, -// and keep b constant (using a mask)? or avx512 to directly get all indices using a mask_compressstore? +// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? 
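// Illustrative usage sketch (not part of this patch; `mi_count_visit` and `count` are hypothetical
// names): count all set bits by visiting each one. The visitor matches `mi_forall_set_fun_t` as it
// is defined at this point in the series, and returns true to continue the traversal.
static bool mi_count_visit(size_t slice_index, mi_arena_t* arena, void* arg) {
  MI_UNUSED(slice_index); MI_UNUSED(arena);
  size_t* count = (size_t*)arg;   // the user argument is passed through unchanged
  (*count)++;
  return true;                    // returning false would stop the traversal early
}
// e.g.:  size_t count = 0;  _mi_bitmap_forall_set(bitmap, &mi_count_visit, arena, &count);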
bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { - mi_bitmap_forall_chunks(bitmap, 0, chunk_idx) { - mi_bchunk_t* chunk = &bitmap->chunks[chunk_idx]; - for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { - const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); - mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); - size_t bidx; - while (mi_bsf(b, &bidx)) { - b = b & (b-1); // clear low bit - const size_t idx = base_idx + bidx; - if (!visit(idx, arena, arg)) return false; + // for all chunkmap entries + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for(size_t i = 0; i < chunkmap_max; i++) { + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + // for each chunk (corresponding to a set bit in a chunkmap entry) + while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + // for each chunk field + mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + size_t bidx; + while (mi_bfield_foreach_bit(&b, &bidx)) { + const size_t idx = base_idx + bidx; + if (!visit(idx, arena, arg)) return false; + } } } } - mi_bitmap_forall_chunks_end(); return true; } From df956c4a17cf6fa5e6c0bf73b50079c836ba5a37 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 17:22:41 -0800 Subject: [PATCH 078/264] use thread spacing for reclaim as well --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 0588858d..b84b42a4 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1111,7 +1111,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } #define mi_bfield_cycle_iterate(bfield,tseq,cycle,name_idx,SUF) { \ - const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); \ + const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); /* or: 0 to always search from the start? 
*/\ mi_bfield_iterate(bfield,_start##SUF,cycle,name_idx,SUF) #define mi_bfield_cycle_iterate_end(SUF) \ From d5c4a16e58d25c54809c65461c3abe9b075a64df Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 17:57:36 -0800 Subject: [PATCH 079/264] lower full page retain more aggressively in a threadpool --- src/heap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/heap.c b/src/heap.c index 1d8142f7..dee404d2 100644 --- a/src/heap.c +++ b/src/heap.c @@ -202,7 +202,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->allow_page_reclaim = false; // and halve the full page retain (possibly to 0) if (heap->full_page_retain >= 0) { - heap->full_page_retain = heap->full_page_retain / 2; + heap->full_page_retain = heap->full_page_retain / 4; } } From 637de624b3a012dbe8417c15bb61112712b71167 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 19:55:45 -0800 Subject: [PATCH 080/264] fix free bug for meta data --- src/arena-meta.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index 0fb4dfa5..401231ac 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -145,7 +145,7 @@ void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_bitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); // we zero on free (and on the initial page allocation) so we don't need a "dirty" map _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); - mi_bitmap_clearN(&mpage->blocks_free, block_idx, block_count); + mi_bitmap_setN(&mpage->blocks_free, block_idx, block_count,NULL); } else if (mi_memid_is_os(memid)) { _mi_os_free(p, size, memid); @@ -154,3 +154,14 @@ void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_memid_needs_no_free(memid)); } } + +bool _mi_meta_is_meta_page(void* p) +{ + mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t, &mi_meta_pages); + mi_meta_page_t* mpage = mpage0; + while (mpage != NULL) { + if ((void*)mpage == p) return true; + mpage = mi_meta_page_next(mpage); + } + return false; +} \ No newline at end of file From 623eaedf336fb4372b4de368c7b9292ea81d5f18 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 19:59:54 -0800 Subject: [PATCH 081/264] add debug output for page map; free tld on thread exit --- include/mimalloc/internal.h | 1 + src/arena.c | 45 +++++++++++++++++++++++++++++++------ src/init.c | 13 ++++++----- src/page.c | 4 ++-- test/test-stress.c | 10 +++++---- 5 files changed, 55 insertions(+), 18 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index ee7f1026..a5ca3e27 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -159,6 +159,7 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); // arena-meta.c void* _mi_meta_zalloc( size_t size, mi_memid_t* memid ); void _mi_meta_free(void* p, size_t size, mi_memid_t memid); +bool _mi_meta_is_meta_page(void* p); // "page-map.c" bool _mi_page_map_init(void); diff --git a/src/arena.c b/src/arena.c index 32c0b32e..a61f59b0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -162,6 +162,8 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { static mi_memid_t mi_memid_create_arena(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(slice_index < UINT32_MAX); mi_assert_internal(slice_count < UINT32_MAX); + mi_assert_internal(slice_count > 0); + mi_assert_internal(slice_index < 
arena->slice_count); mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); memid.mem.arena.arena = arena; memid.mem.arena.slice_index = (uint32_t)slice_index; @@ -663,7 +665,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; - page->memid = memid; + page->memid = memid; + mi_assert_internal(memid.mem.arena.slice_count > 0); page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)mi_ctz(block_size); @@ -1197,7 +1200,33 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) { +static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t* arena, size_t slice_index) { + size_t bit_set_count = 0; + long bit_of_page = 0; + for (int bit = 0; bit < MI_BFIELD_BITS; bit++, bit_of_page--) { + bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); + void* start = mi_arena_slice_start(arena, slice_index + bit); + if (is_set) { + bit_set_count++; + mi_page_t* page = (mi_page_t*)start; + char c = 'p'; + if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } + else if (mi_page_is_abandoned(page)) { c = 'f'; } + bit_of_page = (long)page->memid.mem.arena.slice_count - 1; + buf[bit] = c; + } + else { + char c = '.'; + if (bit_of_page > 0) { c = '-'; } + else if (_mi_meta_is_meta_page(start)) { c = 'm'; } + else if (slice_index + bit < arena->info_slices) { c = 'i'; } + buf[bit] = c; + } + } + return bit_set_count; +} + +static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { _mi_output_message("%s:\n", header); size_t bit_count = 0; size_t bit_set_count = 0; @@ -1217,7 +1246,8 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; - size_t xcount = mi_debug_show_bfield(bfield, buf + k); + size_t xcount = (arena!=NULL ? mi_debug_show_page_bfield(bfield, buf + k, arena, bit_count) + : mi_debug_show_bfield(bfield, buf + k)); if (invert) xcount = MI_BFIELD_BITS - xcount; bit_set_count += xcount; k += MI_BFIELD_BITS; @@ -1246,15 +1276,16 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; slice_total += arena->slice_count; - _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? 
", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true); + free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } - mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false); + mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); // todo: abandoned slices if (show_purge) { - purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false); + purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); } + mi_debug_show_bitmap("pages (p=page, a=abandoned, f=full-abandoned, i=info, m=meta)", arena->slice_count, arena->pages, false, arena); } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); diff --git a/src/init.c b/src/init.c index 85588970..5c5186b9 100644 --- a/src/init.c +++ b/src/init.c @@ -398,11 +398,10 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // merge stats _mi_stats_done(&heap->tld->stats); - // free if not the main thread - if (heap != &_mi_heap_main) { - _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); - } - else { + // free heap meta data + _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); + + if (heap == &_mi_heap_main) { #if 0 // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, // there may still be delete/free calls after the mi_fls_done is called. Issue #207 @@ -410,6 +409,10 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); #endif } + + // free the tld + mi_tld_t* tld = _mi_tld(); + _mi_meta_free(_mi_tld(), sizeof(mi_tld_t), tld->memid); return false; } diff --git a/src/page.c b/src/page.c index a90c1d7d..a30db6c9 100644 --- a/src/page.c +++ b/src/page.c @@ -82,7 +82,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - + // const size_t bsize = mi_page_block_size(page); // uint8_t* start = mi_page_start(page); //mi_assert_internal(start + page->capacity*page->block_size == page->top); @@ -623,7 +623,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); - + // initialize an initial free list mi_page_extend_free(heap,page); mi_assert(mi_page_immediate_available(page)); diff --git a/test/test-stress.c b/test/test-stress.c index 0488fc2b..df535e6e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -344,12 +344,14 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG //mi_debug_show_arenas(true, true, false); + // mi_debug_show_arenas(true, false, false); mi_collect(true); - mi_debug_show_arenas(true,true,false); - #endif - // mi_collect(true); - // mi_debug_show_arenas(true, true, false); + mi_debug_show_arenas(true, false, false); + #else + mi_collect(false); + mi_debug_show_arenas(true, true, false); // mi_stats_print(NULL); + #endif #else mi_stats_print(NULL); // so we see rss/commit/elapsed #endif From b53ac835f117076b83f985943e1e0b436f54f755 Mon 
Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 20:01:37 -0800 Subject: [PATCH 082/264] comment --- src/arena-meta.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index 401231ac..bc98d3f9 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -155,6 +155,7 @@ void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { } } +// used for debug output bool _mi_meta_is_meta_page(void* p) { mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t, &mi_meta_pages); @@ -164,4 +165,4 @@ bool _mi_meta_is_meta_page(void* p) mpage = mi_meta_page_next(mpage); } return false; -} \ No newline at end of file +} From e43eb1f19167b01ec89d91fe05e8e6c2a2d2828b Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 20:22:24 -0800 Subject: [PATCH 083/264] nicer debug output --- include/mimalloc.h | 2 +- src/arena.c | 23 +++++++++++++---------- src/bitmap.c | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 97f74c83..710e5d67 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef int mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index a61f59b0..c9a61291 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1212,7 +1212,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t char c = 'p'; if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } else if (mi_page_is_abandoned(page)) { c = 'f'; } - bit_of_page = (long)page->memid.mem.arena.slice_count - 1; + bit_of_page = (long)page->memid.mem.arena.slice_count; buf[bit] = c; } else { @@ -1265,13 +1265,12 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi return bit_set_count; } -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { - MI_UNUSED(show_abandoned); +void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) mi_attr_noexcept { size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; - size_t purge_total = 0; + size_t page_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; @@ -1280,16 +1279,20 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) if (show_inuse) { free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } - mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); - // todo: abandoned slices - if (show_purge) { - purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); + if (show_committed) { + mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); + 
} + // todo: abandoned slices + //if (show_purge) { + // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); + //} + if (show_pages) { + page_total += mi_debug_show_bitmap("pages (p=page, a=abandoned, f=full-abandoned, i=info, m=meta)", arena->slice_count, arena->pages, false, arena); } - mi_debug_show_bitmap("pages (p=page, a=abandoned, f=full-abandoned, i=info, m=meta)", arena->slice_count, arena->pages, false, arena); } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); - if (show_purge) _mi_output_message("total purgeable slices: %zu\n", purge_total); + if (show_pages) _mi_output_message("total pages in areanas: %zu\n", page_total); } diff --git a/src/bitmap.c b/src/bitmap.c index b84b42a4..649a7046 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1220,7 +1220,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_ typedef struct mi_claim_fun_data_s { mi_arena_t* arena; mi_subproc_t* subproc; - int heap_tag; + mi_heaptag_t heap_tag; } mi_claim_fun_data_t; static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) From 3010d5890f0c305ef9b54dc8a138f81fe496fdac Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 12 Dec 2024 20:27:46 -0800 Subject: [PATCH 084/264] fix assertion --- src/arena.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index c9a61291..5f996b89 100644 --- a/src/arena.c +++ b/src/arena.c @@ -666,7 +666,6 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; page->memid = memid; - mi_assert_internal(memid.mem.arena.slice_count > 0); page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)mi_ctz(block_size); From ba39e4d65b87c7a7a5a6bdb09174d35adcb058ed Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 13 Dec 2024 09:03:17 -0800 Subject: [PATCH 085/264] wip: start on purge --- include/mimalloc.h | 2 +- src/arena.c | 60 ++++++++++++++++++++++++++++------------------ test/test-stress.c | 8 +++---- 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 710e5d67..24217fae 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef int mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index c9a61291..07239d25 100644 --- a/src/arena.c +++ b/src/arena.c @@ -37,23 +37,31 @@ typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area mi_arena_id_t id; // arena id (> 0 where `arena == arenas[arena->id - 1]`) - size_t slice_count; // size of the area in arena slices (of 
`MI_ARENA_SLICE_SIZE`) + size_t slice_count; // total size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node - bool is_exclusive; // only allow allocations if specifically for this arena + bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) - _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. + _Atomic(mi_msecs_t) purge_expire; // expiration time when pages can be purged from `pages_purge`. mi_bitmap_t* slices_free; // is the slice free? mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) - mi_bitmap_t* slices_purge; // can the slice be purged? (slice in purge => slice in free) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? - mi_bitmap_t* pages; // all registered pages + mi_bitmap_t* pages; // all registered pages (abandoned and owned) + mi_bitmap_t* pages_purge; // pages that are scheduled to be purged mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages - // followed by the bitmaps (whose size depends on the arena size) + // followed by the bitmaps (whose sizes depend on the arena size) } mi_arena_t; +// Every "page" in `pages_purge` points to purge info +// (since we use it for any free'd range and not just for pages) +typedef struct mi_purge_info_s { + mi_msecs_t expire; + size_t slice_count; +} mi_purge_info_t; + + #define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) // 160 arenas is enough for ~2 TiB memory @@ -262,8 +270,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (commit) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); } mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); - // mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - + return p; } @@ -569,7 +576,6 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); @@ -665,8 +671,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; - page->memid = memid; - mi_assert_internal(memid.mem.arena.slice_count > 0); + page->memid = memid; page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)mi_ctz(block_size); @@ -771,7 +776,6 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); 
mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may @@ -809,7 +813,6 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { mi_assert_internal(!mi_page_is_singleton(page)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); @@ -865,8 +868,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - + // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); @@ -926,8 +928,8 @@ static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { } // potentially decommit - if (!arena->memid.is_pinned && !arena->memid.initially_committed) { // todo: maybe allow decommit even if initially committed? - // (delay) purge the entire range + if (!arena->memid.is_pinned /* && !arena->memid.initially_committed */) { // todo: allow decommit even if initially committed? + // (delay) purge the page mi_arena_schedule_purge(arena, slice_index, slice_count); } @@ -1121,8 +1123,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->slices_free = mi_arena_bitmap_init(slice_count,&base); arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); - arena->slices_purge = mi_arena_bitmap_init(slice_count,&base); arena->pages = mi_arena_bitmap_init(slice_count, &base); + arena->pages_purge = mi_arena_bitmap_init(slice_count, &base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { arena->pages_abandoned[i] = mi_arena_bitmap_init(slice_count,&base); } @@ -1207,21 +1209,33 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); void* start = mi_arena_slice_start(arena, slice_index + bit); if (is_set) { + mi_assert_internal(bit_of_page <= 0); bit_set_count++; mi_page_t* page = (mi_page_t*)start; char c = 'p'; if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } - else if (mi_page_is_abandoned(page)) { c = 'f'; } + else if (mi_page_is_abandoned(page)) { c = (mi_page_is_singleton(page) ? 
's' : 'f'); } bit_of_page = (long)page->memid.mem.arena.slice_count; buf[bit] = c; } else { - char c = '.'; + char c = '?'; if (bit_of_page > 0) { c = '-'; } else if (_mi_meta_is_meta_page(start)) { c = 'm'; } else if (slice_index + bit < arena->info_slices) { c = 'i'; } + // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } + else if (mi_bitmap_is_setN(arena->slices_free, slice_index+bit, 1)) { + if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, 1)) { + mi_assert_internal(bit_of_page <= 0); + mi_purge_info_t* pinfo = (mi_purge_info_t*)start; + c = '!'; + bit_of_page = (long)pinfo->slice_count; + } + if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } + else { c = '.'; } + } buf[bit] = c; - } + } } return bit_set_count; } @@ -1265,7 +1279,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi return bit_set_count; } -void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) mi_attr_noexcept { +void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; size_t slice_total = 0; @@ -1287,7 +1301,7 @@ void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap("pages (p=page, a=abandoned, f=full-abandoned, i=info, m=meta)", arena->slice_count, arena->pages, false, arena); + page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, !:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); diff --git a/test/test-stress.c b/test/test-stress.c index df535e6e..4fe6e0c6 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,7 +40,7 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 0 +#elif 1 static int THREADS = 4; static int SCALE = 10; static int ITER = 10; @@ -345,11 +345,11 @@ int main(int argc, char** argv) { #ifndef NDEBUG //mi_debug_show_arenas(true, true, false); // mi_debug_show_arenas(true, false, false); - mi_collect(true); - mi_debug_show_arenas(true, false, false); + // mi_collect(true); + mi_debug_show_arenas(true,false,false); #else mi_collect(false); - mi_debug_show_arenas(true, true, false); + mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif #else From 4c81c3cf90135fd2d3e00be19faf3c5fd7d53f71 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 13 Dec 2024 13:17:00 -0800 Subject: [PATCH 086/264] enable purging of free committed slices from arenas --- include/mimalloc/types.h | 2 +- src/arena.c | 162 +++++++++++++++++++++++++++++---------- src/bitmap.c | 59 +++++++++++++- src/bitmap.h | 23 +++++- src/options.c | 10 ++- src/prim/unix/prim.c | 6 +- test/test-stress.c | 16 ++-- 7 files changed, 222 insertions(+), 56 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f4bfa07a..bf1cb5c8 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -321,7 +321,7 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 8 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB #define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index 8cf61b74..9f95a699 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,13 +42,13 @@ typedef struct mi_arena_s { int numa_node; // associated NUMA node bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) - _Atomic(mi_msecs_t) purge_expire; // expiration time when pages can be purged from `pages_purge`. + _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. mi_bitmap_t* slices_free; // is the slice free? mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? + mi_bitmap_t* slices_purge; // slices that can be purged mi_bitmap_t* pages; // all registered pages (abandoned and owned) - mi_bitmap_t* pages_purge; // pages that are scheduled to be purged mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages // followed by the bitmaps (whose sizes depend on the arena size) @@ -57,8 +57,8 @@ typedef struct mi_arena_s { // Every "page" in `pages_purge` points to purge info // (since we use it for any free'd range and not just for pages) typedef struct mi_purge_info_s { - mi_msecs_t expire; - size_t slice_count; + _Atomic(mi_msecs_t) expire; + _Atomic(size_t) slice_count; } mi_purge_info_t; @@ -1123,8 +1123,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->slices_free = mi_arena_bitmap_init(slice_count,&base); arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); + arena->slices_purge = mi_arena_bitmap_init(slice_count, &base); arena->pages = mi_arena_bitmap_init(slice_count, &base); - arena->pages_purge = mi_arena_bitmap_init(slice_count, &base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { arena->pages_abandoned[i] = mi_arena_bitmap_init(slice_count,&base); } @@ -1224,16 +1224,12 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t else if (_mi_meta_is_meta_page(start)) { c = 'm'; } else if (slice_index + bit < arena->info_slices) { c = 'i'; } // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } - else if (mi_bitmap_is_setN(arena->slices_free, slice_index+bit, 1)) { - if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, 1)) { - mi_assert_internal(bit_of_page <= 0); - mi_purge_info_t* pinfo = (mi_purge_info_t*)start; - c = '!'; - bit_of_page = (long)pinfo->slice_count; - } - if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } + else if (mi_bitmap_is_set(arena->slices_free, slice_index+bit)) { + if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '!'; } + else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } else { c = '.'; } } + if (bit==MI_BFIELD_BITS-1 && 
bit_of_page > 1) { c = '>'; } buf[bit] = c; } } @@ -1390,53 +1386,121 @@ static long mi_arena_purge_delay(void) { return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); } -// reset or decommit in an arena and update the committed/decommit bitmaps +// reset or decommit in an arena and update the commit bitmap // assumes we own the area (i.e. slices_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices) { +static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_slices(slices); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + + const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); - bool needs_recommit; - if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slices)) { - // all slices are committed, we can purge freely + bool needs_recommit = false; // reset needs no recommit, decommit does need it + if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)) { + // all slices are committed, we can purge the entire range needs_recommit = _mi_os_purge(p, size); } else { - // some slices are not committed -- this can happen when a partially committed slice is freed - // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), - // and also undo the decommit stats (as it was already adjusted) - mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + mi_assert_internal(false); // ? } - // clear the purged slices - mi_bitmap_clearN(arena->slices_purge, slices, slice_index); - // update committed bitmap if (needs_recommit) { - mi_bitmap_clearN(arena->slices_committed, slices, slice_index); + mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } } // Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. // Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices) { +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { const long delay = mi_arena_purge_delay(); if (delay < 0 || _mi_preloading()) return; // is purging allowed at all? 
+ mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (delay == 0) { - // decommit directly - mi_arena_purge(arena, slice_index, slices); + // purge directly + mi_arena_purge(arena, slice_index, slice_count); } else { - // schedule decommit - _mi_error_message(EFAULT, "purging not yet implemented\n"); + // schedule purge + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire == 0) { + mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + } + //else { + // mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay + //} + mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL); } } +typedef struct mi_purge_visit_info_s { + mi_msecs_t now; + mi_msecs_t delay; + bool all_purged; + bool any_purged; +} mi_purge_visit_info_t; + +static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size_t slice_count) { + if (mi_bitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { + // purge + mi_arena_purge(arena, slice_index, slice_count); + // and reset the free range + mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); + return true; + } + else { + return false; + } +} + +static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) { + mi_purge_visit_info_t* vinfo = (mi_purge_visit_info_t*)arg; + // try to purge: first claim the free blocks + if (mi_arena_try_purge_range(arena, slice_index, slice_count)) { + vinfo->any_purged = true; + } + else { + // failed to claim the full range, try per slice instead + for (size_t i = 0; i < slice_count; i++) { + vinfo->any_purged = vinfo->any_purged || mi_arena_try_purge_range(arena, slice_index + i, 1); + } + } + // done: clear the purge bits + mi_bitmap_clearN(arena->slices_purge, slice_index, slice_count); + return true; // continue +} + + + +// returns true if anything was purged +static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) +{ + // check pre-conditions + if (arena->memid.is_pinned) return false; + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire == 0) return false; + + // expired yet? 
+ if (!force && expire > now) return false; + + // reset expire (if not already set concurrently) + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + + // go through all purge info's + // todo: instead of visiting per-bit, we should visit per range of bits + mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; + _mi_bitmap_forall_set(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); + + // if not fully purged, make sure to purge again in the future + if (!vinfo.all_purged) { + const long delay = mi_arena_purge_delay(); + mi_msecs_t expected = 0; + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expected, _mi_clock_now() + delay); + } + return vinfo.any_purged; +} + static void mi_arenas_try_purge(bool force, bool visit_all) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled @@ -1444,12 +1508,27 @@ static void mi_arenas_try_purge(bool force, bool visit_all) { const size_t max_arena = mi_arena_get_count(); if (max_arena == 0) return; - // _mi_error_message(EFAULT, "purging not yet implemented\n"); - MI_UNUSED(visit_all); - MI_UNUSED(force); + // allow only one thread to purge at a time + static mi_atomic_guard_t purge_guard; + mi_atomic_guard(&purge_guard) + { + const mi_msecs_t now = _mi_clock_now(); + const size_t arena_start = _mi_tld()->tseq % max_arena; + size_t max_purge_count = (visit_all ? max_arena : 1); + for (size_t _i = 0; _i < max_arena; _i++) { + size_t i = _i + arena_start; + if (i >= max_arena) { i -= max_arena; } + mi_arena_t* arena = mi_arena_from_index(i); + if (arena != NULL) { + if (mi_arena_try_purge(arena, now, force)) { + if (max_purge_count <= 1) break; + max_purge_count--; + } + } + } + } } - bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); _mi_error_message(EINVAL, "implement mi_abandoned_visit_blocks\n"); @@ -1460,8 +1539,9 @@ bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool vi /* ----------------------------------------------------------- Unloading and reloading an arena. 
----------------------------------------------------------- */ -static bool mi_arena_page_register(size_t slice_index, mi_arena_t* arena, void* arg) { - MI_UNUSED(arg); +static bool mi_arena_page_register(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) { + MI_UNUSED(arg); MI_UNUSED(slice_count); + mi_assert_internal(slice_count == 1); mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); _mi_page_map_register(page); diff --git a/src/bitmap.c b/src/bitmap.c index 649a7046..88b45a5e 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1051,6 +1051,23 @@ bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, s return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); } +// ------- mi_bitmap_try_clearN --------------------------------------- + +bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (cidx + n > MI_BCHUNK_BITS) return false; + bool maybe_all_clear; + const bool cleared = mi_bchunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (cleared && maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return cleared; +} // ------- mi_bitmap_is_xset --------------------------------------- @@ -1071,6 +1088,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n + /* -------------------------------------------------------------------------------- Iterate through a bfield -------------------------------------------------------------------------------- */ @@ -1144,7 +1162,7 @@ static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, si // and for each chunkmap entry we iterate over its bits to find the chunks mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[cmap_idx]); size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); - mi_bfield_cycle_iterate(cmap_entry, tseq, cmap_entry_cycle, eidx, Y) + mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) { mi_assert_internal(eidx <= MI_BFIELD_BITS); const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; @@ -1314,10 +1332,47 @@ bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_a size_t bidx; while (mi_bfield_foreach_bit(&b, &bidx)) { const size_t idx = base_idx + bidx; - if (!visit(idx, arena, arg)) return false; + if (!visit(idx, 1, arena, arg)) return false; } } } } return true; } + +// Visit all set bits in a bitmap but try to return ranges (within bfields) if possible. +// used by purging to purge larger ranges if possible +// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? 
+bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { + // for all chunkmap entries + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = 0; i < chunkmap_max; i++) { + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + // for each chunk (corresponding to a set bit in a chunkmap entry) + while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + // for each chunk field + mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + size_t bshift = 0; + size_t bidx; + while (mi_bfield_find_least_bit(b, &bidx)) { + b >>= bidx; + bshift += bidx; + const size_t rng = mi_ctz(~b); // all the set bits from bidx + mi_assert_internal(rng>=1); + const size_t idx = base_idx + bshift + bidx; + if (!visit(idx, rng, arena, arg)) return false; + // skip rng + b >>= rng; + bshift += rng; + } + } + } + } + return true; +} + diff --git a/src/bitmap.h b/src/bitmap.h index 7fd09f43..72ba69c1 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -171,6 +171,22 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n); } +static inline bool mi_bitmap_is_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_is_setN(bitmap, idx, 1); +} + +static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_is_clearN(bitmap, idx, 1); +} + + +bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n); + +static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_clearN(bitmap, idx, 1); +} + + // Specialized versions for common bit sequence sizes mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit @@ -212,9 +228,12 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); -typedef bool (mi_forall_set_fun_t)(size_t slice_index, mi_arena_t* arena, void* arg2); +typedef bool (mi_forall_set_fun_t)(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg2); -// Visit all set bits in a bitmap +// Visit all set bits in a bitmap (`slice_count == 1`) bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); +// Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) +bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); + #endif // MI_BITMAP_H diff --git a/src/options.c b/src/options.c index 8fcee452..4f1a00b8 100644 --- a/src/options.c +++ b/src/options.c @@ -79,8 +79,12 @@ typedef struct mi_option_desc_s { #endif #ifndef MI_DEFAULT_ALLOW_LARGE_OS_PAGES +#if defined(__linux__) && !defined(__ANDROID__) +#define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 1 +#else #define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 0 #endif +#endif #ifndef MI_DEFAULT_RESERVE_HUGE_OS_PAGES #define MI_DEFAULT_RESERVE_HUGE_OS_PAGES 0 @@ -132,7 +136,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - 
{ -1, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 1000,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -141,7 +145,7 @@ static mi_option_desc_t options[_mi_option_last] = { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) - { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's + { 1, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. @@ -192,7 +196,7 @@ void _mi_options_init(void) { } } _mi_verbose_message("guarded build: %s\n", mi_option_get(mi_option_guarded_sample_rate) != 0 ? "enabled" : "disabled"); - #endif + #endif } long _mi_option_get_fast(mi_option_t option) { diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index e1ca3964..eb351f69 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -61,6 +61,7 @@ terms of the MIT license. A copy of the license can be found in the file #include #endif +#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this? //------------------------------------------------------------------------------------ // Use syscalls for some primitives to allow for libraries that override open/read/close etc. @@ -146,7 +147,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) } #endif } - config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? + config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE; config->has_overcommit = unix_detect_overcommit(); config->has_partial_free = true; // mmap can free in parts config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) @@ -361,6 +362,9 @@ int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool comm mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); + if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) { + try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations + } *is_zero = true; int protect_flags = (commit ? 
(PROT_WRITE | PROT_READ) : PROT_NONE); diff --git a/test/test-stress.c b/test/test-stress.c index 4fe6e0c6..126a7601 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,10 +40,10 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 1 +#elif 0 static int THREADS = 4; static int SCALE = 10; -static int ITER = 10; +static int ITER = 20; #define ALLOW_LARGE false #elif 0 static int THREADS = 32; @@ -260,8 +260,12 @@ static void test_stress(void) { //mi_debug_show_arenas(); #endif #if !defined(NDEBUG) || defined(MI_TSAN) - if ((n + 1) % 10 == 0) - { printf("- iterations left: %3d\n", ITER - (n + 1)); } + if ((n + 1) % 10 == 0) { + printf("- iterations left: %3d\n", ITER - (n + 1)); + //mi_debug_show_arenas(true, false, false); + //mi_collect(true); + //mi_debug_show_arenas(true, false, false); + } #endif } // clean up @@ -344,8 +348,8 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG //mi_debug_show_arenas(true, true, false); - // mi_debug_show_arenas(true, false, false); - // mi_collect(true); + mi_debug_show_arenas(true, false, false); + mi_collect(true); mi_debug_show_arenas(true,false,false); #else mi_collect(false); From 216c04f8d91cd433897e5c2e46a4a24554558c5d Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 13 Dec 2024 18:39:03 -0800 Subject: [PATCH 087/264] clean up bitmap api --- include/mimalloc/types.h | 3 +- src/arena.c | 7 +- src/bitmap.c | 363 ++++++++++++++++++--------------------- src/bitmap.h | 54 +++--- src/init.c | 3 +- src/stats.c | 11 +- test/test-stress.c | 2 +- 7 files changed, 202 insertions(+), 241 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index bf1cb5c8..bf91a58a 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -471,13 +471,12 @@ typedef struct mi_stats_s { mi_stat_counter_t commit_calls; mi_stat_counter_t reset_calls; mi_stat_counter_t purge_calls; + mi_stat_counter_t arena_purges; mi_stat_counter_t page_no_retire; mi_stat_counter_t searches; mi_stat_counter_t normal_count; mi_stat_counter_t huge_count; mi_stat_counter_t arena_count; - mi_stat_counter_t arena_crossover_count; - mi_stat_counter_t arena_rollback_count; mi_stat_counter_t guarded_alloc_count; #if MI_STAT>1 mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; diff --git a/src/arena.c b/src/arena.c index 9f95a699..7aec429e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1225,7 +1225,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t else if (slice_index + bit < arena->info_slices) { c = 'i'; } // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } else if (mi_bitmap_is_set(arena->slices_free, slice_index+bit)) { - if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '!'; } + if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; } else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } else { c = '.'; } } @@ -1297,7 +1297,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, !:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); + page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, 
f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); @@ -1470,8 +1470,6 @@ static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, m mi_bitmap_clearN(arena->slices_purge, slice_index, slice_count); return true; // continue } - - // returns true if anything was purged static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) @@ -1486,6 +1484,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) // reset expire (if not already set concurrently) mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); // go through all purge info's // todo: instead of visiting per-bit, we should visit per range of bits diff --git a/src/bitmap.c b/src/bitmap.c index 88b45a5e..f689ee58 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -34,7 +34,6 @@ static inline mi_bfield_t mi_bfield_clear_least_bit(mi_bfield_t x) { return (x & (x-1)); } - // find the least significant bit that is set (i.e. count trailing zero's) // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -42,17 +41,13 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } -// find each set bit in a bit field `x` until it becomes zero. +// find each set bit in a bit field `x` and clear it, until it becomes zero. static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { const bool found = mi_bfield_find_least_bit(*x, idx); *x = mi_bfield_clear_least_bit(*x); return found; } -//static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { -// return mi_rotr(x,r); -//} - static inline mi_bfield_t mi_bfield_zero(void) { return 0; } @@ -65,6 +60,7 @@ static inline mi_bfield_t mi_bfield_all_set(void) { return ~((mi_bfield_t)0); } +// mask of `bit_count` bits set shifted to the left by `shiftl` static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { mi_assert_internal(bit_count > 0); mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS); @@ -72,7 +68,10 @@ static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { return (mask0 << shiftl); } + // ------- mi_bfield_atomic_set --------------------------------------- +// the `_set` functions return also the count of bits that were already set (for commit statistics) +// the `_clear` functions return also whether the new bfield is all clear or not (for the chunk_map) // Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { @@ -93,7 +92,8 @@ static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bo } // Clear a bit but only when/once it is set. This is used by concurrent free's while -// the page is abandoned and mapped. +// the page is abandoned and mapped. 
This can incure a busy wait :-( but it should +// happen almost never (and is accounted for in the stats) static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx); } -static inline bool mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); +static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_clear(&chunk->bfields[i], idx, maybe_all_clear); + const size_t idx = cidx % MI_BFIELD_BITS; + const mi_bfield_t mask = mi_bfield_mask(n, idx); + return mi_bfield_atomic_set_mask(&chunk->bfields[i], mask, already_set); } -static inline bool mi_bchunk_set8(mi_bchunk_t* chunk, size_t byte_idx) { - mi_assert_internal(byte_idx < MI_BCHUNK_SIZE); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t bidx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_set8(&chunk->bfields[i], bidx); +static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t cidx, size_t* already_set) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal((cidx%MI_BFIELD_BITS)==0); + const size_t i = cidx / MI_BFIELD_BITS; + return mi_bfield_atomic_setX(&chunk->bfields[i], already_set); } -static inline bool mi_bchunk_clear8(mi_bchunk_t* chunk, size_t byte_idx, bool* maybe_all_clear) { - mi_assert_internal(byte_idx < MI_BCHUNK_SIZE); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t bidx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_clear8(&chunk->bfields[i], bidx, maybe_all_clear); -} - -static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t field_idx) { - mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); - return mi_bfield_atomic_setX(&chunk->bfields[field_idx]); -} - -static inline bool mi_bchunk_clearX(mi_bchunk_t* chunk, size_t field_idx, bool* maybe_all_clear) { - mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); - if (maybe_all_clear != NULL) { *maybe_all_clear = true; } - return mi_bfield_atomic_clearX(&chunk->bfields[field_idx]); -} - -// Set/clear a sequence of `n` bits within a chunk. +// Set a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). -static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_xset) { +mi_decl_noinline static bool mi_bchunk_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_set, bool* pmaybe_all_clear) { mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; - size_t total_already_xset = 0; + bool maybe_all_clear = true; + size_t total_already_set = 0; size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; while (n > 0) { @@ -326,28 +272,67 @@ static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BCHUNK_FIELDS); const mi_bfield_t mask = mi_bfield_mask(m, idx); - size_t already_xset = 0; - const bool transition = mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset); - mi_assert_internal((transition && already_xset == 0) || (!transition && already_xset > 0)); + size_t already_set = 0; + bool all_clear = false; + const bool transition = (set ? 
mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, &already_set) + : mi_bfield_atomic_clear_mask(&chunk->bfields[field], mask, &all_clear)); + mi_assert_internal((transition && already_set == 0) || (!transition && already_set > 0)); all_transition = all_transition && transition; - total_already_xset += already_xset; + total_already_set += already_set; + maybe_all_clear = maybe_all_clear && all_clear; // next field field++; idx = 0; n -= m; } - if (palready_xset!=NULL) { *palready_xset = total_already_xset; } + if (palready_set!=NULL) { *palready_set = total_already_set; } + if (pmaybe_all_clear!=NULL) { *pmaybe_all_clear = maybe_all_clear; } return all_transition; } static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { - return mi_bchunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); + mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); + if (n==1) { + bool was_clear = mi_bchunk_set(chunk, cidx); + if (already_set != NULL) { *already_set = !was_clear; } + return was_clear; + } + if (n==MI_BFIELD_BITS) return mi_bchunk_setX(chunk, cidx, already_set); + if (n bfields[i], idx, all_clear); } +static inline bool mi_bchunk_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + const mi_bfield_t mask = mi_bfield_mask(n, idx); + return mi_bfield_atomic_clear_mask(&chunk->bfields[i], mask, all_clear); +} + +static inline bool mi_bchunk_clearX(mi_bchunk_t* chunk, size_t cidx, bool* all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal((cidx%MI_BFIELD_BITS)==0); + const size_t i = cidx / MI_BFIELD_BITS; + return mi_bfield_atomic_clearX(&chunk->bfields[i], all_clear); +} + +static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { + mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); + if (n==1) return mi_bchunk_clear(chunk, cidx, maybe_all_clear); + if (n==MI_BFIELD_BITS) return mi_bchunk_clearX(chunk, cidx, maybe_all_clear); + if (n 0); if (n==0) return true; - size_t field = cidx / MI_BFIELD_BITS; - size_t idx = cidx % MI_BFIELD_BITS; + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; if mi_likely(n<=MI_BFIELD_BITS) { - return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mi_bfield_mask(n, idx)); + return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } else { - return mi_bchunk_is_xsetN_(set, chunk, field, idx, n); + return mi_bchunk_is_xsetN_(set, chunk, i, idx, n); } } -// ------- mi_bchunk_try_xset --------------------------------------- +// ------- mi_bchunk_try_clear --------------------------------------- + +static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal(n <= MI_BFIELD_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + mi_assert_internal(idx + n <= MI_BFIELD_BITS); + const size_t mask = mi_bfield_mask(n, idx); + return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mask, pmaybe_all_clear); +} + +static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); + const size_t i = cidx / MI_BFIELD_BITS; + return 
mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); +} // Try to atomically set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. -static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { +// Note: this is a hard one as we need to unwind partial atomic operations +// if we fail halfway.. +mi_decl_noinline static bool mi_bchunk_try_clearN_(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; @@ -414,7 +418,7 @@ static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); mi_assert_internal(start_field < MI_BCHUNK_FIELDS); const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start, &field_is_clear)) return false; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &field_is_clear)) return false; maybe_all_clear = maybe_all_clear && field_is_clear; // done? @@ -431,7 +435,7 @@ static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, field++; mi_assert_internal(field < MI_BCHUNK_FIELDS); mask_mid = mi_bfield_all_set(); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid, &field_is_clear)) goto restore; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_mid, &field_is_clear)) goto restore; maybe_all_clear = maybe_all_clear && field_is_clear; n -= MI_BFIELD_BITS; } @@ -443,7 +447,7 @@ static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, mi_assert_internal(field < MI_BCHUNK_FIELDS); end_field = field; mask_end = mi_bfield_mask(n, 0); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end, &field_is_clear)) goto restore; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_end, &field_is_clear)) goto restore; maybe_all_clear = maybe_all_clear && field_is_clear; } @@ -456,17 +460,17 @@ restore: while( field > start_field) { field--; const size_t mask = (field == start_field ? mask_start : (field == end_field ? 
mask_end : mask_mid)); - mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, NULL); + mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, NULL); } return false; } -// static inline bool mi_bchunk_try_setN(mi_bchunk_t* chunk, size_t cidx, size_t n) { -// return mi_bchunk_try_xsetN(MI_BIT_SET, chunk, cidx, n, NULL); -// } static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { - return mi_bchunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n, maybe_all_clear); + mi_assert_internal(n>0); + if (n==MI_BFIELD_BITS) return mi_bchunk_try_clearX(chunk, cidx, maybe_all_clear); + if (nbfields[chunk_idx], byte_idx, NULL)) { // unset the byte atomically + mi_assert_internal((idx%8)==0); + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // unset the byte atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; @@ -614,9 +617,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s if (mask==0) return false; const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk const size_t chunk_idx = bidx / 8; - const size_t byte_idx = bidx % 8; // byte index of the byte in the bfield + const size_t idx = (bidx % 8)*8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; @@ -672,7 +675,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); - if (~b==0 && mi_bfield_atomic_try_clearX(&chunk->bfields[i])) { + if (~b==0 && mi_bfield_atomic_try_clearX(&chunk->bfields[i], NULL)) { *pidx = i*MI_BFIELD_BITS; mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); return true; @@ -691,7 +694,7 @@ static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. 
// (We do not cross bfield boundaries) -static mi_decl_noinline bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -955,69 +958,31 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // ------- mi_bitmap_xset --------------------------------------- // Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { +bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards - return wasclear; - } - else { - bool maybe_all_clear; - const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; - } + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return wasclear; } -// Set/clear aligned 8-bits in the bitmap (with `(idx%8)==0`). -// Returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -static bool mi_bitmap_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { +bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - mi_assert_internal((idx%8)==0); const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t byte_idx = (idx % MI_BCHUNK_BITS)/8; + const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool wasclear = mi_bchunk_set8(&bitmap->chunks[chunk_idx], byte_idx); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards - return wasclear; - } - else { - bool maybe_all_clear; - const bool wasset = mi_bchunk_clear8(&bitmap->chunks[chunk_idx], byte_idx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; - } + bool maybe_all_clear; + const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return wasset; } -// Set/clear a field of bits. 
-// Returns `true` if atomically transitioned from 0 to ~0 (or ~0 to 0) -static bool mi_bitmap_xsetX(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - mi_assert_internal((idx%MI_BFIELD_BITS)==0); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t field_idx = (idx % MI_BCHUNK_BITS)/MI_BFIELD_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool wasclear = mi_bchunk_setX(&bitmap->chunks[chunk_idx],field_idx); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards - return wasclear; - } - else { - bool maybe_all_clear; - const bool wasset = mi_bchunk_clearX(&bitmap->chunks[chunk_idx], field_idx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; - } -} -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { +bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BCHUNK_BITS); @@ -1027,30 +992,30 @@ static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, siz mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia - if (set) { - const bool allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); - mi_bitmap_chunkmap_set(bitmap,chunk_idx); // set afterwards - return allclear; - } - else { - size_t already_clear = 0; - const bool allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear ); - if (already_xset != NULL) { *already_xset = already_clear; } - if (already_clear < n) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return allset; - } + const bool were_allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_set); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return were_allclear; } -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 1's to 0's. // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! 
-bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { - mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); - if (n==1) return mi_bitmap_xset(set, bitmap, idx); - if (n==8) return mi_bitmap_xset8(set, bitmap, idx); - if (n==MI_BFIELD_BITS) return mi_bitmap_xsetX(set, bitmap, idx); - return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); +bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia + + bool maybe_all_clear; + const bool were_allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return were_allset; } + // ------- mi_bitmap_try_clearN --------------------------------------- bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { diff --git a/src/bitmap.h b/src/bitmap.h index 72ba69c1..4afcdaf1 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -13,7 +13,7 @@ Concurrent bitmap that can set/reset sequences of bits atomically #define MI_BITMAP_H /* -------------------------------------------------------------------------------- - Atomic bitmaps: + Atomic bitmaps with release/acquire guarantees: `mi_bfield_t`: is a single machine word that can efficiently be bit counted (usually `size_t`) each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB). @@ -25,19 +25,25 @@ Concurrent bitmap that can set/reset sequences of bits atomically These chunks are cache-aligned and we can use AVX2/AVX512/NEON/SVE/SVE2/etc. instructions to scan for bits (perhaps) more efficiently. - `mi_bchunkmap_t` == `mi_bchunk_t`: for each chunk we track if it has (potentially) any bit set. + We allocate byte-sized ranges aligned to bytes in the bfield, and bfield-sized + ranges aligned to a bfield. + + Searching linearly through the chunks would be too slow (16K bits per GiB). + Instead we add a "chunkmap" to do a two-level search (more or less a btree of depth 2). + + `mi_bchunkmap_t` (== `mi_bchunk_t`): for each chunk we track if it has (potentially) any bit set. The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set. This is used to avoid scanning every chunk. (and thus strictly an optimization) - It is conservative: it is fine to a bit in the chunk map even if the chunk turns out + It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out to have no bits set. It is also allowed to briefly have a clear bit even if the - chunk has bits set, as long as we guarantee that we set the bit later on -- this - allows us to set the chunkmap bit after we set a bit in the corresponding chunk. + chunk has bits set -- as long as we guarantee that the bit will be set later on; + (this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk). However, when we clear a bit in a chunk, and the chunk is indeed all clear, we cannot safely clear the bit corresponding to the chunk in the chunkmap since it may race with another thread setting a bit in the same chunk. 
Therefore, when clearing, we first test if a chunk is clear, then clear the chunkmap bit, and - then test again to catch any set bits that we missed. + then test again to catch any set bits that we may have missed. Since the chunkmap may thus be briefly out-of-sync, this means that we may sometimes not find a free page even though it's there (but we accept this as we avoid taking @@ -130,32 +136,22 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); // Not atomic so only use if still local to a thread. void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); +// Set a bit in the bitmap; returns `true` if it atomically transitioned from 0 to 1 +bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx); -// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); - -static inline bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_xset(MI_BIT_SET, bitmap, idx); -} - -static inline bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_xset(MI_BIT_CLEAR, bitmap, idx); -} +// Clear a bit in the bitmap; returns `true` if it atomically transitioned from 1 to 0 +bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx); -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -// If `already_xset` is not NULL, it is set to count of bits were already all set/cleared. +// If `already_set` is not NULL, it is set to count of bits were already all set. // (this is used for correct statistics if commiting over a partially committed area) -bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); +bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set); -static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { - return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, already_set); -} - -static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_xsetN(MI_BIT_CLEAR, bitmap, idx, n, NULL); -} +// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n); // Is a sequence of n bits already all set/cleared? @@ -167,6 +163,7 @@ static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); } +// Is a sequence of n bits already clear? static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n); } @@ -180,8 +177,11 @@ static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) { } +// Try to atomically transition `n` bits from all set to all clear. Returns `true` on succes. +// `n` cannot cross chunk boundaries, where `n <= MI_CHUNK_BITS`. bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n); +// Try to atomically transition a bit from set to clear. Returns `true` on succes. 
static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { return mi_bitmap_try_clearN(bitmap, idx, 1); } @@ -223,7 +223,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); -// If a bit is set in the bitmap, return `true` and set `idx` to its index. +// If a bit is set in the bitmap, return `true` and set `idx` to the index of the highest bit. // Otherwise return `false` (and `*idx` is undefined). bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); diff --git a/src/init.c b/src/init.c index 5c5186b9..8f1449a3 100644 --- a/src/init.c +++ b/src/init.c @@ -84,8 +84,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \ MI_STAT_COUNT_END_NULL() // -------------------------------------------------------- diff --git a/src/stats.c b/src/stats.c index 2a793b59..860a69ef 100644 --- a/src/stats.c +++ b/src/stats.c @@ -338,12 +338,11 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); - mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg); - mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg); - mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg); - mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); - mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); - mi_stat_counter_print(&stats->purge_calls, "purges", out, arg); + mi_stat_counter_print(&stats->arena_purges, "-purges", out, arg); + mi_stat_counter_print(&stats->mmap_calls, "mmap calls", out, arg); + mi_stat_counter_print(&stats->commit_calls, " -commit", out, arg); + mi_stat_counter_print(&stats->reset_calls, "-reset", out, arg); + mi_stat_counter_print(&stats->purge_calls, "-purge", out, arg); mi_stat_counter_print(&stats->guarded_alloc_count, "guarded", out, arg); mi_stat_print(&stats->threads, "threads", -1, out, arg); mi_stat_counter_print_avg(&stats->searches, "searches", out, arg); diff --git a/test/test-stress.c b/test/test-stress.c index 126a7601..1996e52e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -352,7 +352,7 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,false,false); #else - mi_collect(false); + //mi_collect(true); mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif From b5dfd233e943855a381b7c36750fc20e54e154bc Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 13 Dec 2024 19:59:08 -0800 Subject: [PATCH 088/264] fix avx2 bug with atomics --- CMakeLists.txt | 4 +-- src/bitmap.c | 63 +++++++++++++++++++--------------------------- test/test-stress.c | 2 +- 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa35d749..344b72a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,8 +117,8 @@ if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo") if (NOT MI_OPT_ARCH) message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)") endif() -else() - set(MI_OPT_ARCH OFF) +#else() +# set(MI_OPT_ARCH OFF) endif() if(MI_OVERRIDE) diff --git a/src/bitmap.c b/src/bitmap.c index 
f689ee58..d8e207e3 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -143,20 +143,9 @@ static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_cle return (~old==0); } -// ------- mi_bfield_atomic_try_set/clear --------------------------------------- +// ------- mi_bfield_atomic_try_clear --------------------------------------- -// Tries to set a mask atomically, and returns true if the mask bits atomically transitioned from 0 to mask -// and false otherwise (leaving the bit field as is). -static inline bool mi_bfield_atomic_try_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { - mi_assert_internal(mask != 0); - mi_bfield_t old = mi_atomic_load_relaxed(b); - do { - if ((old&mask) != 0) return false; // the mask bits are no longer 0 - } while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits - return true; -} - // Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0 // and false otherwise (leaving the bit field as is). static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { @@ -242,16 +231,16 @@ static inline bool mi_bchunk_set(mi_bchunk_t* chunk, size_t cidx) { } static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal(cidx < MI_BCHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; const mi_bfield_t mask = mi_bfield_mask(n, idx); return mi_bfield_atomic_set_mask(&chunk->bfields[i], mask, already_set); } static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t cidx, size_t* already_set) { mi_assert_internal(cidx < MI_BCHUNK_BITS); - mi_assert_internal((cidx%MI_BFIELD_BITS)==0); + mi_assert_internal((cidx%MI_BFIELD_BITS)==0); const size_t i = cidx / MI_BFIELD_BITS; return mi_bfield_atomic_setX(&chunk->bfields[i], already_set); } @@ -380,9 +369,9 @@ static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t mi_assert_internal(cidx < MI_BCHUNK_BITS); mi_assert_internal(n <= MI_BFIELD_BITS); const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; mi_assert_internal(idx + n <= MI_BFIELD_BITS); - const size_t mask = mi_bfield_mask(n, idx); + const size_t mask = mi_bfield_mask(n, idx); return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mask, pmaybe_all_clear); } @@ -493,12 +482,14 @@ static inline bool mi_mm256_is_zero( __m256i vec) { static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) { mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; + // note: this must be acquire (and not relaxed), or otherwise the AVX code below can loop forever + // as the compiler won't reload the registers vec1 and vec2 from memory again. 
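+  // (the release half of that required release/acquire pairing comes from the
+  //  `mi_atomic_cas_weak_acq_rel` in the atomic try-clear helpers above, so after a
+  //  failed claim the bfields are re-read on the next iteration of the retry loop)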
+ const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]); + size_t idx; if (!allow_allset && (~b == 0)) return false; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } @@ -522,6 +513,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx const size_t chunk_idx = _tzcnt_u32(mask) / 8; if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { @@ -555,7 +547,8 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx chunk_idx = mi_ctz(mask) / 8; #endif if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; - // try again + // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else // try first to find a field that is not all set (to reduce fragmentation) @@ -586,7 +579,7 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c size_t idx; if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); - mi_assert_internal((idx%8)==0); + mi_assert_internal((idx%8)==0); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // unset the byte atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); @@ -617,10 +610,10 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s if (mask==0) return false; const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk const size_t chunk_idx = bidx / 8; - const size_t idx = (bidx % 8)*8; + const size_t idx = (bidx % 8)*8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx; + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; } @@ -665,7 +658,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. const size_t chunk_idx = _tzcnt_u64(mask) / 8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx])) { + if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx],NULL)) { *pidx = chunk_idx*MI_BFIELD_BITS; mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); return true; @@ -804,13 +797,6 @@ static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { // ------- mi_bitmap_all_are_clear --------------------------------------- -// are all bits in a bitmap chunk clear? 
(this uses guaranteed atomic reads) -static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { - for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; - } - return true; -} // are all bits in a bitmap chunk clear? static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { @@ -823,7 +809,10 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2))); #else - return mi_bchunk_all_are_clear(chunk); + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; + } + return true; #endif } @@ -976,7 +965,7 @@ bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { bool maybe_all_clear; const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; + return wasset; } @@ -1169,7 +1158,7 @@ static bool mi_bitmap_try_find_and_clear_visit(mi_bitmap_t* bitmap, size_t chunk } static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) { - return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); + return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); } mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { diff --git a/test/test-stress.c b/test/test-stress.c index 1996e52e..277f9e6e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -353,7 +353,7 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,false,false); #else //mi_collect(true); - mi_debug_show_arenas(true,false,false); + //mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif #else From 4aeb2e1005c41114844175a27985df483120daff Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 13:21:13 -0800 Subject: [PATCH 089/264] flexible clearN_ that can start at any index --- src/bitmap.c | 95 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 31 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index d8e207e3..b7b228c1 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -26,6 +26,10 @@ static inline size_t mi_bfield_ctz(mi_bfield_t x) { return mi_ctz(x); } +static inline size_t mi_bfield_clz(mi_bfield_t x) { + return mi_clz(x); +} + static inline size_t mi_bfield_popcount(mi_bfield_t x) { return mi_popcount(x); } @@ -41,6 +45,15 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } + +// find the most significant bit that is set. +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) { + return mi_bsr(x, idx); +} + + // find each set bit in a bit field `x` and clear it, until it becomes zero. 
static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { const bool found = mi_bfield_find_least_bit(*x, idx); @@ -598,9 +611,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once - const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); - const __m256i cmpv = mi_mm256_ones(); + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); + const __m256i cmpv = mi_mm256_ones(); const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0) const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 0xFF : 0) const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte @@ -610,7 +623,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s if (mask==0) return false; const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk const size_t chunk_idx = bidx / 8; - const size_t idx = (bidx % 8)*8; + const size_t idx = (bidx % 8)*8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; @@ -618,6 +631,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s return true; } // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } } #else // first skip allset fields to reduce fragmentation @@ -664,6 +678,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, return true; } // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -684,7 +699,8 @@ static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, } // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, -// and try to clear them atomically. +// and try to clear them atomically. +// Currently does not cross bfield boundaries. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. 
// (We do not cross bfield boundaries) mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { @@ -732,35 +748,51 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - // we align at a bfield, and scan `field_count` fields - // n >= MI_BFIELD_BITS; find a first field that is 0 - const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields - for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++) + const size_t skip_count = n/MI_BFIELD_BITS; + size_t cidx; + for (size_t i = 0; i <= MI_BCHUNK_FIELDS - skip_count; i++) { - // first pre-scan for a range of fields that are all set (up to the last one) - bool allset = true; - size_t j = 0; - size_t m = n; - do { - mi_assert_internal(i + j < MI_BCHUNK_FIELDS); - mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); - size_t idx; - if (mi_bfield_find_least_bit(~b,&idx)) { - if (m > idx) { - allset = false; - i += j; // no need to look again at the previous fields - break; - } + size_t j = 1; // field count from i + size_t m = n; // bits to go + + // first field + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + size_t ones = mi_bfield_clz(~b); + cidx = i*MI_BFIELD_BITS + (MI_BFIELD_BITS - ones); // start index + if (ones >= m) { + // we found enough bits! + m = 0; + } + else { + m -= ones; + mi_assert_internal(m>0); + } + + // keep scanning further fields? + while (i+j < MI_BCHUNK_FIELDS) { + mi_assert_internal(m > 0); + b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); + ones = mi_bfield_ctz(~b); + if (ones >= m) { + // we found enough bits + m = 0; + break; + } + else if (ones == MI_BFIELD_BITS) { + // not enough yet, proceed to the next field + j++; + m -= MI_BFIELD_BITS; } else { - // all bits in b were set - m -= MI_BFIELD_BITS; // note: can underflow + // the range was not enough, start from scratch + i = i + j - 1; // no need to re-scan previous fields, except the last one (with clz this time) + mi_assert_internal(m>0); + break; } - } while (++j < field_count); - - // if all set, we can try to atomically clear them - if (allset) { - const size_t cidx = i*MI_BFIELD_BITS; + } + + // did we find a range? 
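+    // (each further field must contribute set bits starting at its bit 0, so we count its
+    //  trailing ones with `ctz(~b)`; an all-ones field contributes MI_BFIELD_BITS bits and
+    //  the scan simply moves on to the next field)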
+ if (m==0) { if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) { // we cleared all atomically *pidx = cidx; @@ -768,8 +800,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); return true; } + // note: if we fail for a small `n` on the first field, we don't rescan that field (as `i` is incremented) } - // continue + // otherwise continue searching } return false; } From 13ee94cef6900539ad5f4abf322efdfc3650cfc9 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 13:22:00 -0800 Subject: [PATCH 090/264] fix concurrent mi_tld access bug --- src/arena-meta.c | 4 ++-- src/arena.c | 1 + src/init.c | 28 ++++++++++++++++++---------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index bc98d3f9..ceda06ba 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -93,7 +93,7 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate meta-data -void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) +mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) { mi_assert_internal(pmemid != NULL); size = _mi_align_up(size,MI_META_BLOCK_SIZE); @@ -133,7 +133,7 @@ void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) } // free meta-data -void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { +mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { if (p==NULL) return; if (memid.memkind == MI_MEM_META) { mi_assert_internal(_mi_divide_up(size, MI_META_BLOCK_SIZE) == memid.mem.meta.block_count); diff --git a/src/arena.c b/src/arena.c index 7aec429e..d8b882d3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -551,6 +551,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // any abandoned in our size class? 
mi_subproc_t* const subproc = tld->subproc; + mi_assert_internal(subproc != NULL); if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) return NULL; // search arena's diff --git a/src/init.c b/src/init.c index 8f1449a3..c103f521 100644 --- a/src/init.c +++ b/src/init.c @@ -134,7 +134,8 @@ static mi_decl_cache_align mi_subproc_t mi_subproc_default; static mi_decl_cache_align mi_tld_t tld_main = { 0, - &_mi_heap_main, &_mi_heap_main, + &_mi_heap_main, // heap_backing + &_mi_heap_main, // heaps list &mi_subproc_default, // subproc 0, // tseq MI_MEMID_STATIC, // memid @@ -271,10 +272,23 @@ static mi_tld_t* mi_tld_alloc(void) { } } -mi_tld_t* _mi_tld(void) { +#define MI_TLD_INVALID ((mi_tld_t*)1) + +static mi_decl_noinline void mi_tld_free(void) { + mi_tld_t* tld = _mi_tld(); + mi_tld = MI_TLD_INVALID; + _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); +} + +mi_tld_t* mi_decl_noinline _mi_tld(void) { + if (mi_tld == MI_TLD_INVALID) { + _mi_error_message(EFAULT, "internal error: tld accessed after the thread terminated\n"); + abort(); + mi_tld = NULL; + } if (mi_tld==NULL) { mi_tld = mi_tld_alloc(); - } + } return mi_tld; } @@ -409,9 +423,6 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { #endif } - // free the tld - mi_tld_t* tld = _mi_tld(); - _mi_meta_free(_mi_tld(), sizeof(mi_tld_t), tld->memid); return false; } @@ -497,10 +508,7 @@ void _mi_thread_done(mi_heap_t* heap) _mi_thread_heap_done(heap); // returns true if already ran // free thread local data - if (mi_tld != NULL) { - _mi_meta_free(mi_tld, sizeof(mi_tld_t), mi_tld->memid); - mi_tld = NULL; - } + mi_tld_free(); } void _mi_heap_set_default_direct(mi_heap_t* heap) { From 3153e5a4c5136006a8ea9a8500a578d06f486170 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 13:47:33 -0800 Subject: [PATCH 091/264] small fixes --- src/bitmap.c | 58 +++++++++++++++++++++------------------------------- src/init.c | 5 ++--- 2 files changed, 25 insertions(+), 38 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index b7b228c1..2734e2b2 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -46,14 +46,6 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { } -// find the most significant bit that is set. -// return false if `x==0` (with `*idx` undefined) and true otherwise, -// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) { - return mi_bsr(x, idx); -} - - // find each set bit in a bit field `x` and clear it, until it becomes zero. static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { const bool found = mi_bfield_find_least_bit(*x, idx); @@ -497,7 +489,7 @@ static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t ch mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); // note: this must be acquire (and not relaxed), or otherwise the AVX code below can loop forever // as the compiler won't reload the registers vec1 and vec2 from memory again. 
- const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]); + const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]); size_t idx; if (!allow_allset && (~b == 0)) return false; if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit @@ -560,7 +552,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx chunk_idx = mi_ctz(mask) / 8; #endif if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; - // try again + // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else @@ -699,42 +691,38 @@ static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, } // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, -// and try to clear them atomically. +// and try to clear them atomically. // Currently does not cross bfield boundaries. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. // (We do not cross bfield boundaries) mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); - for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); - size_t bshift = 0; size_t idx; while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - b >>= idx; - bshift += idx; - if (bshift + n > MI_BFIELD_BITS) break; + if (idx + n > MI_BFIELD_BITS) break; - if ((b&mask) == mask) { // found a match - mi_assert_internal( ((mask << bshift) >> bshift) == mask ); - if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i],mask<>idx == mask); + if ((b&bmask) == bmask) { // found a match + if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { + *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); return true; } else { // if failed to atomically commit, reload b and try again from this position - bshift -= idx; - b = mi_atomic_load_relaxed(&chunk->bfields[i]) >> bshift; + b = mi_atomic_load_acquire(&chunk->bfields[i]); } } else { // advance - const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) + const size_t ones = mi_bfield_ctz(~(b>>idx)); // skip all ones (since it didn't fit the mask) mi_assert_internal(ones>0); - b >>= ones; - bshift += ones; + b = b & ~mi_bfield_mask(ones, idx); // clear the ones } } } @@ -748,11 +736,10 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - const size_t skip_count = n/MI_BFIELD_BITS; + const size_t skip_count = n/MI_BFIELD_BITS; size_t cidx; for (size_t i = 0; i <= MI_BCHUNK_FIELDS - skip_count; i++) { - size_t j = 1; // field count from i size_t m = n; // bits to go // first field @@ -761,7 +748,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, cidx = i*MI_BFIELD_BITS + (MI_BFIELD_BITS - ones); // start index if (ones >= m) { // we found enough bits! - m = 0; + m = 0; } else { m -= ones; @@ -769,6 +756,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, } // keep scanning further fields? 
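/* --------------------------------------------------------------------------------
   Illustrative sketch only (not mi_bchunk_try_find_and_clearNX above, which works
   per bfield inside a chunk): claim `n` consecutive set bits inside a single 64-bit
   word in the same way as the rewritten loop above does: take the lowest set bit,
   test the n-bit mask shifted to that position, CAS-clear it on a match, and
   otherwise skip past the (too short) run of ones.
-------------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool word_try_claim_run(_Atomic(uint64_t)* word, size_t n, size_t* start_idx) {
  if (n == 0 || n > 64) return false;
  const uint64_t mask = (n == 64 ? ~UINT64_C(0) : (UINT64_C(1) << n) - 1);
  uint64_t b = atomic_load_explicit(word, memory_order_acquire);
  while (b != 0) {
    const size_t idx = (size_t)__builtin_ctzll(b);   // lowest set bit
    if (idx + n > 64) return false;                  // a fitting run is no longer possible
    const uint64_t bmask = mask << idx;
    if ((b & bmask) == bmask) {                      // n consecutive 1-bits starting at idx
      if (atomic_compare_exchange_weak_explicit(word, &b, b & ~bmask,
                                                memory_order_acq_rel, memory_order_acquire)) {
        *start_idx = idx;
        return true;
      }
      // CAS failure reloaded `b`; retry from the fresh value
    }
    else {
      // the run of ones starting at idx is too short; skip past it (1 <= ones < 64 here)
      const size_t ones = (size_t)__builtin_ctzll(~(b >> idx));
      b &= ~(((UINT64_C(1) << ones) - 1) << idx);
    }
  }
  return false;
}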
+ size_t j = 1; // field count from i while (i+j < MI_BCHUNK_FIELDS) { mi_assert_internal(m > 0); b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); @@ -790,8 +778,8 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, break; } } - - // did we find a range? + + // did we find a range? if (m==0) { if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) { // we cleared all atomically @@ -1194,24 +1182,24 @@ static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, siz return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { +bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { mi_assert_internal(n<=MI_BFIELD_BITS); return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { +bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { mi_assert_internal(n<=MI_BCHUNK_BITS); return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); } diff --git a/src/init.c b/src/init.c index c103f521..9a26d56f 100644 --- a/src/init.c +++ b/src/init.c @@ -274,16 +274,15 @@ static mi_tld_t* mi_tld_alloc(void) { #define MI_TLD_INVALID ((mi_tld_t*)1) -static mi_decl_noinline void mi_tld_free(void) { +mi_decl_noinline static void mi_tld_free(void) { mi_tld_t* tld = _mi_tld(); mi_tld = MI_TLD_INVALID; _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); } -mi_tld_t* mi_decl_noinline _mi_tld(void) { +mi_decl_noinline mi_tld_t* _mi_tld(void) { if (mi_tld == MI_TLD_INVALID) { _mi_error_message(EFAULT, "internal error: tld accessed after the thread terminated\n"); - abort(); mi_tld = NULL; } if (mi_tld==NULL) { From df9009a06051ba763cae6700d49c7e5318934940 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 17:15:56 -0800 Subject: [PATCH 092/264] wip: binned bitmap for the free slices --- src/arena.c | 57 +++++---- src/bitmap.c | 324 ++++++++++++++++++++++++++++++++++++++++++++++----- src/bitmap.h | 95 ++++++++++++++- 3 files changed, 423 insertions(+), 53 deletions(-) diff --git a/src/arena.c b/src/arena.c index d8b882d3..84db2fb0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -44,7 +44,7 @@ typedef struct mi_arena_s { bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(mi_msecs_t) purge_expire; // expiration time 
when slices can be purged from `slices_purge`. - mi_bitmap_t* slices_free; // is the slice free? + mi_bbitmap_t* slices_free; // is the slice free? (a binned bitmap with size classes) mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? mi_bitmap_t* slices_purge; // slices that can be purged @@ -213,7 +213,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) { size_t slice_index; - if (!mi_bitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; + if (!mi_bbitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; // claimed it! void* p = mi_arena_slice_start(arena, slice_index); @@ -267,7 +267,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (commit) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); } mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); @@ -574,7 +574,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); @@ -775,7 +775,7 @@ void _mi_arena_page_free(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); @@ -812,7 +812,7 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(!mi_page_is_singleton(page)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); @@ -867,7 +867,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); 
mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done @@ -935,7 +935,7 @@ static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { } // and make it available to others again - bool all_inuse = mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); + bool all_inuse = mi_bbitmap_setN(arena->slices_free, slice_index, slice_count); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; @@ -1051,8 +1051,8 @@ static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_bas if (slice_count == 0) slice_count = MI_BCHUNK_BITS; mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0); const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BCHUNK_SIZE); - const size_t bitmaps_count = 4 + MI_BIN_COUNT; // free, commit, dirty, purge, and abandonded - const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count,NULL); + const size_t bitmaps_count = 4 + MI_BIN_COUNT; // commit, dirty, purge, and abandonded + const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count, NULL) + mi_bbitmap_size(slice_count, NULL); // + free const size_t size = base_size + bitmaps_size; const size_t os_page_size = _mi_os_page_size(); @@ -1069,6 +1069,12 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { return bitmap; } +static mi_bbitmap_t* mi_arena_bbitmap_init(size_t slice_count, uint8_t** base) { + mi_bbitmap_t* bbitmap = (mi_bbitmap_t*)(*base); + *base = (*base) + mi_bbitmap_init(bbitmap, slice_count, true /* already zero */); + return bbitmap; +} + static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { @@ -1121,7 +1127,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // init bitmaps uint8_t* base = mi_arena_start(arena) + bitmap_base; - arena->slices_free = mi_arena_bitmap_init(slice_count,&base); + arena->slices_free = mi_arena_bbitmap_init(slice_count,&base); arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); arena->slices_purge = mi_arena_bitmap_init(slice_count, &base); @@ -1132,7 +1138,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_assert_internal(mi_size_of_slices(info_slices) >= (size_t)(base - mi_arena_start(arena))); // reserve our meta info (and reserve slices outside the memory area) - mi_bitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); + mi_bbitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { mi_bitmap_unsafe_setN(arena->slices_committed, 0, arena->slice_count); } @@ -1225,7 +1231,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t else if (_mi_meta_is_meta_page(start)) { c = 'm'; } else if (slice_index + bit < arena->info_slices) { c = 'i'; } // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } - else if (mi_bitmap_is_set(arena->slices_free, slice_index+bit)) { + else if (mi_bbitmap_is_setN(arena->slices_free, slice_index+bit, 1)) { if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; } 
else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } else { c = '.'; } @@ -1237,14 +1243,14 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { +static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, bool invert, mi_arena_t* arena) { _mi_output_message("%s:\n", header); size_t bit_count = 0; size_t bit_set_count = 0; - for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { + for (size_t i = 0; i < chunk_count && bit_count < slice_count; i++) { char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); size_t k = 0; - mi_bchunk_t* chunk = &bitmap->chunks[i]; + mi_bchunk_t* chunk = &chunks[i]; if (i<10) { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; } else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } @@ -1276,6 +1282,15 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi return bit_set_count; } +static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { + return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], invert, arena); +} + +static size_t mi_debug_show_bbitmap(const char* header, size_t slice_count, mi_bbitmap_t* bbitmap, bool invert, mi_arena_t* arena) { + return mi_debug_show_chunks(header, slice_count, mi_bbitmap_chunk_count(bbitmap), &bbitmap->chunks[0], invert, arena); +} + + void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; @@ -1288,7 +1303,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) slice_total += arena->slice_count; _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); + free_total += mi_debug_show_bbitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } if (show_committed) { mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); @@ -1391,7 +1406,7 @@ static long mi_arena_purge_delay(void) { // assumes we own the area (i.e. slices_free is claimed by us) static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(!arena->memid.is_pinned); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); @@ -1417,7 +1432,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ const long delay = mi_arena_purge_delay(); if (delay < 0 || _mi_preloading()) return; // is purging allowed at all? 
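/* --------------------------------------------------------------------------------
   Sketch of the purge hand-shake that mi_arena_try_purge_range performs further
   below, reduced to a single free-mask word with illustrative names: temporarily
   claim the free range (clear it in the free map so no allocation can take it),
   decommit/reset the memory, then mark the range as free again.
-------------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static void os_purge_stub(void* start, size_t size) {  // stand-in for the actual decommit/reset
  (void)start; (void)size;
}

static bool try_purge_range(_Atomic(uint64_t)* free_mask, uint64_t range_mask,
                            void* start, size_t size) {
  uint64_t expected = atomic_load_explicit(free_mask, memory_order_relaxed);
  do {
    if ((expected & range_mask) != range_mask) return false;  // range is no longer entirely free
  } while (!atomic_compare_exchange_weak_explicit(free_mask, &expected, expected & ~range_mask,
                                                  memory_order_acq_rel, memory_order_acquire));
  os_purge_stub(start, size);                                  // safe: we own the range now
  atomic_fetch_or_explicit(free_mask, range_mask, memory_order_release);  // make it free again
  return true;
}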
- mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (delay == 0) { // purge directly mi_arena_purge(arena, slice_index, slice_count); @@ -1443,11 +1458,11 @@ typedef struct mi_purge_visit_info_s { } mi_purge_visit_info_t; static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size_t slice_count) { - if (mi_bitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { + if (mi_bbitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { // purge mi_arena_purge(arena, slice_index, slice_count); // and reset the free range - mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); + mi_bbitmap_setN(arena->slices_free, slice_index, slice_count); return true; } else { diff --git a/src/bitmap.c b/src/bitmap.c index 2734e2b2..4a0c4a60 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -477,9 +477,9 @@ static inline __m256i mi_mm256_zero(void) { static inline __m256i mi_mm256_ones(void) { return _mm256_set1_epi64x(~0); } -//static inline bool mi_mm256_is_ones(__m256i vec) { -// return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); -//} +static inline bool mi_mm256_is_ones(__m256i vec) { + return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); +} static inline bool mi_mm256_is_zero( __m256i vec) { return _mm256_testz_si256(vec,vec); } @@ -706,7 +706,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const size_t bmask = mask<>idx == mask); - if ((b&bmask) == bmask) { // found a match + if ((b&bmask) == bmask) { // found a match if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); @@ -837,6 +837,24 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #endif } +// are all bits in a bitmap chunk set? +static inline bool mi_bchunk_all_are_set_relaxed(mi_bchunk_t* chunk) { +#if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return mi_mm256_is_ones(vec); +#elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + // a 64b cache-line contains the entire chunk anyway so load both at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (mi_mm256_is_ones(_mm256_and_si256(vec1, vec2))); +#else + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (~mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; + } + return true; +#endif +} + static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { @@ -902,6 +920,7 @@ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { return size; } + // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true // returns the size of the bitmap size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) { @@ -915,38 +934,33 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) return size; } -// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
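/* --------------------------------------------------------------------------------
   Sketch of the non-atomic bulk set that the new mi_bchunks_unsafe_setN helper
   below performs at chunk granularity: set a partial head word, memset the whole
   words in the middle, then set a partial tail word. Assumes the range is still
   local to one thread (e.g. during arena initialization); names are illustrative.
-------------------------------------------------------------------------------- */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void words_set_range(uint64_t* words, size_t idx, size_t n) {
  size_t wi = idx / 64;
  const size_t bi = idx % 64;
  // head: the upper part of the first word
  size_t m = 64 - bi; if (m > n) { m = n; }
  words[wi] |= ((m == 64 ? ~UINT64_C(0) : (UINT64_C(1) << m) - 1) << bi);
  wi++; n -= m;
  // middle: whole words at once
  const size_t mid = n / 64;
  if (mid > 0) { memset(&words[wi], 0xFF, mid * sizeof(uint64_t)); wi += mid; n -= mid*64; }
  // tail: the low bits of the last word
  if (n > 0) { words[wi] |= (UINT64_C(1) << n) - 1; }
}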
-void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); - // first chunk +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + const size_t total = n; + + + // start chunk and index size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; + const size_t ccount = _mi_divide_up(n, MI_BCHUNK_BITS); + + // first update the chunkmap + mi_bchunk_setN(cmap, chunk_idx, ccount, NULL); + + // first chunk size_t m = MI_BCHUNK_BITS - cidx; if (m > n) { m = n; } - mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); + mi_bchunk_setN(&chunks[chunk_idx], cidx, m, NULL); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; const size_t mid_chunks = n / MI_BCHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); - const size_t end_chunk = chunk_idx + mid_chunks; - while (chunk_idx < end_chunk) { - if ((chunk_idx % MI_BFIELD_BITS) == 0 && (chunk_idx + MI_BFIELD_BITS <= end_chunk)) { - // optimize: we can set a full bfield in the chunkmap - mi_atomic_store_relaxed( &bitmap->chunkmap.bfields[chunk_idx/MI_BFIELD_BITS], mi_bfield_all_set()); - mi_bitmap_chunkmap_set(bitmap, chunk_idx + MI_BFIELD_BITS - 1); // track the max set - chunk_idx += MI_BFIELD_BITS; - } - else { - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - chunk_idx++; - } - } + _mi_memset(&chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); + chunk_idx += mid_chunks; n -= (mid_chunks * MI_BCHUNK_BITS); } @@ -954,12 +968,15 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { if (n > 0) { mi_assert_internal(n < MI_BCHUNK_BITS); mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); + mi_bchunk_setN(&chunks[chunk_idx], 0, n, NULL); } +} - // reset max_accessed - mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0); +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); + mi_bchunks_unsafe_setN(&bitmap->chunks[0], &bitmap->chunkmap, idx, n); } @@ -1085,7 +1102,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n #define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \ mi_assert_internal(start <= cycle); \ mi_assert_internal(start < MI_BFIELD_BITS); \ - mi_assert_internal(cycle < MI_BFIELD_BITS); \ + mi_assert_internal(cycle <= MI_BFIELD_BITS); \ mi_bfield_t _cycle_mask##SUF = mi_bfield_mask(cycle - start, start); \ size_t _bcount##SUF = mi_bfield_popcount(bfield); \ mi_bfield_t _b##SUF = bfield & _cycle_mask##SUF; /* process [start, cycle> first*/\ @@ -1250,7 +1267,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) 
-mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, +bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) { mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; @@ -1351,3 +1368,248 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi return true; } + + +/* -------------------------------------------------------------------------------- + binned bitmap's +-------------------------------------------------------------------------------- */ + + +size_t mi_bbitmap_size(size_t bit_count, size_t* pchunk_count) { + mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0); + bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS); + mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); + mi_assert_internal(bit_count > 0); + const size_t chunk_count = bit_count / MI_BCHUNK_BITS; + mi_assert_internal(chunk_count >= 1); + const size_t size = offsetof(mi_bbitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE); + mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); + if (pchunk_count != NULL) { *pchunk_count = chunk_count; } + return size; +} + +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap +size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero) { + size_t chunk_count; + const size_t size = mi_bbitmap_size(bit_count, &chunk_count); + if (!already_zero) { + _mi_memzero_aligned(bbitmap, size); + } + mi_atomic_store_release(&bbitmap->chunk_count, chunk_count); + mi_assert_internal(mi_atomic_load_relaxed(&bbitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); + return size; +} + +void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + mi_bchunks_unsafe_setN(&bbitmap->chunks[0], &bbitmap->chunkmap, idx, n); +} + + + +/* -------------------------------------------------------------------------------- + binned bitmap chunkmap +-------------------------------------------------------------------------------- */ + +static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) { + size_t oldmax = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); + if mi_unlikely(chunk_idx > oldmax) { + mi_atomic_cas_strong_relaxed(&bbitmap->chunk_max_accessed, &oldmax, chunk_idx); + } +} + +static void mi_bbitmap_chunkmap_set(mi_bbitmap_t* bbitmap, size_t chunk_idx, bool check_all_set) { + mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (check_all_set) { + if (mi_bchunk_all_are_set_relaxed(&bbitmap->chunks[chunk_idx])) { + // all slices are free in this chunk: return back to the NONE bin + mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], MI_BBIN_NONE); + } + } + mi_bchunk_set(&bbitmap->chunkmap, chunk_idx); + mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); +} + +static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_idx) { + mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + // check if the corresponding chunk is all clear + if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) return false; + // clear the chunkmap bit + mi_bchunk_clear(&bbitmap->chunkmap, chunk_idx, NULL); + // .. but a concurrent set may have happened in between our all-clear test and the clearing of the + // bit in the mask. We check again to catch this situation. 
+ if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) { + mi_bchunk_set(&bbitmap->chunkmap, chunk_idx); + return false; + } + mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); + return true; +} + +// Assign from the NONE bin to a specific size bin +static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) { + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin); +} + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_setN, try_clearN, and is_xsetN + (used to find free pages) +-------------------------------------------------------------------------------- */ + +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia + + const bool were_allclear = mi_bchunk_setN(&bbitmap->chunks[chunk_idx], cidx, n, NULL); + mi_bbitmap_chunkmap_set(bbitmap, chunk_idx, true); // set after + return were_allclear; +} + + +// ------- mi_bbitmap_try_clearN --------------------------------------- + +bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) return false; + bool maybe_all_clear; + const bool cleared = mi_bchunk_try_clearN(&bbitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (cleared && maybe_all_clear) { mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } + // note: we don't set the size class for an explicit try_clearN (only used by purging) + return cleared; +} + + +// ------- mi_bbitmap_is_xset --------------------------------------- + +// Is a sequence of n bits already all set/cleared? 
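/* --------------------------------------------------------------------------------
   Sketch of the conservative chunkmap protocol used by mi_bbitmap_chunkmap_set and
   mi_bbitmap_chunkmap_try_clear above, simplified to one summary word over an array
   of detail words (illustrative types, not the mimalloc ones). A summary bit may be
   1 while its detail word is already empty, but never the reverse for long: after
   clearing the summary bit we re-check the detail word and restore the bit if a
   concurrent set slipped in between.
-------------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct two_level_s {
  _Atomic(uint64_t) summary;             // bit i set => detail[i] may contain set bits
  _Atomic(uint64_t) detail[64];
} two_level_t;

static void tl_set(two_level_t* tl, size_t chunk, uint64_t bits) {
  atomic_fetch_or_explicit(&tl->detail[chunk], bits, memory_order_release);
  atomic_fetch_or_explicit(&tl->summary, UINT64_C(1) << chunk, memory_order_release); // set afterwards
}

static bool tl_summary_try_clear(two_level_t* tl, size_t chunk) {
  if (atomic_load_explicit(&tl->detail[chunk], memory_order_relaxed) != 0) return false;
  atomic_fetch_and_explicit(&tl->summary, ~(UINT64_C(1) << chunk), memory_order_release);
  // a concurrent tl_set may have set detail bits just before we cleared the summary
  // bit; re-check and restore so the summary stays a conservative over-approximation
  if (atomic_load_explicit(&tl->detail[chunk], memory_order_relaxed) != 0) {
    atomic_fetch_or_explicit(&tl->summary, UINT64_C(1) << chunk, memory_order_release);
    return false;
  }
  return true;
}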
+bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia + + return mi_bchunk_is_xsetN(set, &bbitmap->chunks[chunk_idx], cidx, n); +} + + + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_find + (used to find free pages) +-------------------------------------------------------------------------------- */ + +typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); + +// Go through the bbitmap and for every sequence of `n` set bits, call the visitor function. +// If it returns `true` stop the search. +static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* on_find) +{ + // we space out threads to reduce contention + const size_t cmap_max_count = _mi_divide_up(mi_bbitmap_chunk_count(bbitmap),MI_BFIELD_BITS); + const size_t chunk_acc = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); + const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS; + const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS); + + // create a mask over the chunkmap entries to iterate over them efficiently + mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); + const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); + const size_t cmap_cycle = cmap_acc+1; + const mi_bbin_t bbin = mi_bbin_of(n); + // visit bins from largest size bin up to the NONE bin + // for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL + const mi_bbin_t bin = bbin; + { + mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) + { + // don't search into non-accessed memory until we tried other size bins as well + //if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { + // break; + //} + + // and for each chunkmap entry we iterate over its bits to find the chunks + const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); + const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); + mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) + { + mi_assert_internal(eidx <= MI_BFIELD_BITS); + const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + // only in the current size class! 
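/* --------------------------------------------------------------------------------
   Sketch of the per-chunk size-class check applied at this point of the search
   (illustrative names and a simplified matching rule): every chunk carries a bin
   byte; a request only searches chunks whose bin matches its own bin or is still
   unassigned, and the first allocation out of an unassigned chunk assigns the bin.
   mimalloc keys the assignment off claiming the chunk's first slice; this sketch
   uses a CAS instead so it stays race-free on its own.
-------------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef enum { BIN_NONE, BIN_SMALL, BIN_MEDIUM, BIN_OTHER } bin_t;

static bin_t bin_of(size_t n) {
  return (n == 1 ? BIN_SMALL : (n == 8 ? BIN_MEDIUM : BIN_OTHER));
}

static bool chunk_bin_allows(_Atomic(uint8_t)* chunk_bin, size_t n) {
  const bin_t want = bin_of(n);
  const bin_t have = (bin_t)atomic_load_explicit(chunk_bin, memory_order_acquire);
  return (have == want || have == BIN_NONE);
}

static void chunk_bin_claim(_Atomic(uint8_t)* chunk_bin, size_t n) {
  uint8_t expected = (uint8_t)BIN_NONE;  // only a still-unassigned chunk can be claimed
  atomic_compare_exchange_strong_explicit(chunk_bin, &expected, (uint8_t)bin_of(n),
                                          memory_order_release, memory_order_relaxed);
}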
+ const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_acquire(&bbitmap->chunk_bins[chunk_idx]); + if (bin >= chunk_bin) { // || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { + mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; + size_t cidx; + if ((*on_find)(chunk, n, &cidx)) { + if (cidx==0 && chunk_bin == MI_BBIN_NONE) { // only the first determines the size bin + // this chunk is now reserved for the `bbin` size class + mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin); + } + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap)); + return true; + } + else { + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ + mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); + } + } + } + mi_bfield_cycle_iterate_end(Y); + } + mi_bfield_cycle_iterate_end(X); + } + return false; +} + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_try_find_and_clear -- used to find free pages + note: the compiler will fully inline the indirect function calls +-------------------------------------------------------------------------------- */ + +bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); +} + +bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); +} + +bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); +} + +bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); +} + +bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BCHUNK_BITS); + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); +} diff --git a/src/bitmap.h b/src/bitmap.h index 4afcdaf1..b28a09e4 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -36,7 +36,7 @@ Concurrent bitmap that can set/reset sequences of bits atomically This is used to avoid scanning every chunk. (and thus strictly an optimization) It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out to have no bits set. It is also allowed to briefly have a clear bit even if the - chunk has bits set -- as long as we guarantee that the bit will be set later on; + chunk has bits set -- as long as we guarantee that the bit will be set later on; (this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk). 
However, when we clear a bit in a chunk, and the chunk is indeed all clear, we @@ -236,4 +236,97 @@ bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_a // Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); +// +typedef enum mi_bbin_e { + MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free) + MI_BBIN_SMALL, // slice_count == 1 + MI_BBIN_MEDIUM, // slice_count == 8 + MI_BBIN_OTHER, // slice_count > 1, and not 8 + MI_BBIN_COUNT +} mi_bbin_t; + +static inline mi_bbin_t mi_bbin_of(size_t n) { + return (n==1 ? MI_BBIN_SMALL : (n==8 ? MI_BBIN_MEDIUM : MI_BBIN_OTHER)); +} + +// An atomic "binned" bitmap for the free slices where we keep chunks reserved for particalar size classes +typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s { + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc + mi_bchunkmap_t chunkmap; + _Atomic(uint8_t) chunk_bins[MI_BITMAP_MAX_CHUNK_COUNT]; // 512b + mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT +} mi_bbitmap_t; + + +static inline size_t mi_bbitmap_chunk_count(const mi_bbitmap_t* bbitmap) { + return mi_atomic_load_relaxed(&((mi_bbitmap_t*)bbitmap)->chunk_count); +} + +static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) { + return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS); +} + +size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count); + + +// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap. +size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero); + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). +// Not atomic so only use if still local to a thread. +void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + +// Set a sequence of `n` bits in the bbitmap; returns `true` if atomically transitioned from all 0's to 1's +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + +// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bbitmap_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + +// Is a sequence of n bits already all set/cleared? +bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n); + +// Is a sequence of n bits already set? +// (Used to check if a memory range is already committed) +static inline bool mi_bbitmap_is_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + return mi_bbitmap_is_xsetN(MI_BIT_SET, bbitmap, idx, n); +} + +// Is a sequence of n bits already clear? +static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + return mi_bbitmap_is_xsetN(MI_BIT_CLEAR, bbitmap, idx, n); +} + + +// Try to atomically transition `n` bits from all set to all clear. Returns `true` on succes. +// `n` cannot cross chunk boundaries, where `n <= MI_CHUNK_BITS`. 
+bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + + +// Specialized versions for common bit sequence sizes +bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 1-bit +bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits +bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS +bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS + +// Find a sequence of `n` bits in the bbitmap with all bits set, and try to atomically clear all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard static inline bool mi_bbitmap_try_find_and_clearN(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx) { + if (n==1) return mi_bbitmap_try_find_and_clear(bbitmap, tseq, pidx); // small pages + if (n==8) return mi_bbitmap_try_find_and_clear8(bbitmap, tseq, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearX(bbitmap, tseq, pidx); // large pages + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearNX(bbitmap, tseq, n, pidx); + return mi_bbitmap_try_find_and_clearN_(bbitmap, tseq, n, pidx); +} + + #endif // MI_BITMAP_H From e24217e69cb37d3b2087b7489ca4b2c6bc40f7d7 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 18:35:12 -0800 Subject: [PATCH 093/264] more bbin size classes, bug fixes --- src/arena-meta.c | 12 ++--- src/arena.c | 27 ++++++++-- src/bitmap.c | 119 +++++++-------------------------------------- src/bitmap.h | 48 +++++------------- test/test-stress.c | 2 +- 5 files changed, 59 insertions(+), 149 deletions(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index ceda06ba..86a89755 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -33,7 +33,7 @@ terms of the MIT license. A copy of the license can be found in the file typedef struct mi_meta_page_s { _Atomic(struct mi_meta_page_s*) next; // a linked list of meta-data pages (never released) mi_memid_t memid; // provenance of the meta-page memory itself - mi_bitmap_t blocks_free; // a small bitmap with 1 bit per block. + mi_bbitmap_t blocks_free; // a small bitmap with 1 bit per block. 
} mi_meta_page_t; static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_VAR_INIT(NULL); @@ -76,11 +76,11 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // initialize the page mpage->memid = memid; - mi_bitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); + mi_bbitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); mi_assert_internal(info_blocks < MI_META_BLOCKS_PER_PAGE); - mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); + mi_bbitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); // push atomically in front of the meta page list // (note: there is no ABA issue since we never free meta-pages) @@ -104,7 +104,7 @@ mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) mi_meta_page_t* mpage = mpage0; while (mpage != NULL) { size_t block_idx; - if (mi_bitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { // found and claimed `block_count` blocks *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); return mi_meta_block_start(mpage,block_idx); @@ -122,7 +122,7 @@ mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) mpage = mi_meta_page_zalloc(); if (mpage != NULL) { size_t block_idx; - if (mi_bitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { // found and claimed `block_count` blocks *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); return mi_meta_block_start(mpage,block_idx); @@ -145,7 +145,7 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_bitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); // we zero on free (and on the initial page allocation) so we don't need a "dirty" map _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); - mi_bitmap_setN(&mpage->blocks_free, block_idx, block_count,NULL); + mi_bbitmap_setN(&mpage->blocks_free, block_idx, block_count); } else if (mi_memid_is_os(memid)) { _mi_os_free(p, size, memid); diff --git a/src/arena.c b/src/arena.c index 84db2fb0..1547c9b2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1243,7 +1243,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t return bit_set_count; } -static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, bool invert, mi_arena_t* arena) { +static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena) { _mi_output_message("%s:\n", header); size_t bit_count = 0; size_t bit_set_count = 0; @@ -1256,9 +1256,22 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } + char chunk_kind = ' '; + if (chunk_bins != NULL) { + 
switch (chunk_bins[i]) { + // case MI_BBIN_SMALL: chunk_kind = 'S'; break; + case MI_BBIN_MEDIUM: chunk_kind = 'M'; break; + case MI_BBIN_LARGE: chunk_kind = 'L'; break; + case MI_BBIN_OTHER: chunk_kind = 'O'; break; + // case MI_BBIN_NONE: chunk_kind = 'N'; break; + } + } + buf[k++] = chunk_kind; + buf[k++] = ' '; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { - buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5; + buf[k++] = '\n'; _mi_memset(buf+k,' ',7); k += 7; } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; @@ -1283,11 +1296,15 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ } static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { - return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], invert, arena); + return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], NULL, invert, arena); +} + +static size_t mi_debug_show_bitmap_binned(const char* header, size_t slice_count, mi_bitmap_t* bitmap, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena) { + return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena); } static size_t mi_debug_show_bbitmap(const char* header, size_t slice_count, mi_bbitmap_t* bbitmap, bool invert, mi_arena_t* arena) { - return mi_debug_show_chunks(header, slice_count, mi_bbitmap_chunk_count(bbitmap), &bbitmap->chunks[0], invert, arena); + return mi_debug_show_chunks(header, slice_count, mi_bbitmap_chunk_count(bbitmap), &bbitmap->chunks[0], &bbitmap->chunk_bins[0], invert, arena); } @@ -1313,7 +1330,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); + page_total += mi_debug_show_bitmap_binned("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); diff --git a/src/bitmap.c b/src/bitmap.c index 4a0c4a60..a847740b 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -938,9 +938,7 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) // Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) { mi_assert_internal(n>0); - const size_t total = n; - - + // start chunk and index size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; @@ -984,29 +982,6 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // ------- mi_bitmap_xset --------------------------------------- -// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards - return wasclear; -} - -bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - bool maybe_all_clear; - const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; -} - - // Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { @@ -1043,24 +1018,17 @@ bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { } -// ------- mi_bitmap_try_clearN --------------------------------------- - -bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BCHUNK_BITS); - mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); - - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BCHUNK_BITS) return false; - bool maybe_all_clear; - const bool cleared = mi_bchunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); - if (cleared && maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return cleared; +// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_setN(bitmap, idx, 1, NULL); } +bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_clearN(bitmap, idx, 1); +} + + + // ------- mi_bitmap_is_xset --------------------------------------- // Is a sequence of n bits already all set/cleared? 
@@ -1170,58 +1138,6 @@ static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, si } -/* -------------------------------------------------------------------------------- - mi_bitmap_try_find_and_clear -- used to find free pages - note: the compiler will fully inline the indirect function calls --------------------------------------------------------------------------------- */ - - -typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); - -static bool mi_bitmap_try_find_and_clear_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) { - MI_UNUSED(arg2); - mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear = (mi_bchunk_try_find_and_clear_fun_t*)arg1; - size_t cidx; - // if we find a spot in the chunk we are done - if ((*try_find_and_clear)(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); - return true; - } - else { - /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - return false; - } -} - -static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) { - return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); -} - -bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); -} - -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); -} - -bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); -} - -bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { - mi_assert_internal(n<=MI_BFIELD_BITS); - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); -} - -bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { - mi_assert_internal(n<=MI_BCHUNK_BITS); - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); -} - - /* -------------------------------------------------------------------------------- Bitmap: try_find_and_claim -- used to allocate abandoned pages note: the compiler will fully inline the indirect function call @@ -1267,7 +1183,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) 
-bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, +mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) { mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; @@ -1541,15 +1457,15 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const size_t cmap_cycle = cmap_acc+1; const mi_bbin_t bbin = mi_bbin_of(n); // visit bins from largest size bin up to the NONE bin - // for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL - const mi_bbin_t bin = bbin; + for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL + // const mi_bbin_t bin = bbin; { mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { // don't search into non-accessed memory until we tried other size bins as well - //if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { - // break; - //} + if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { + break; + } // and for each chunkmap entry we iterate over its bits to find the chunks const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); @@ -1561,7 +1477,8 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); // only in the current size class! const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_acquire(&bbitmap->chunk_bins[chunk_idx]); - if (bin >= chunk_bin) { // || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { + if // (bin >= chunk_bin) { + (bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; size_t cidx; if ((*on_find)(chunk, n, &cidx)) { diff --git a/src/bitmap.h b/src/bitmap.h index b28a09e4..f221f2cd 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -136,13 +136,13 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); // Not atomic so only use if still local to a thread. void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); + // Set a bit in the bitmap; returns `true` if it atomically transitioned from 0 to 1 bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx); // Clear a bit in the bitmap; returns `true` if it atomically transitioned from 1 to 0 bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx); - // Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! // If `already_set` is not NULL, it is set to count of bits were already all set. @@ -177,36 +177,6 @@ static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) { } -// Try to atomically transition `n` bits from all set to all clear. Returns `true` on succes. -// `n` cannot cross chunk boundaries, where `n <= MI_CHUNK_BITS`. -bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n); - -// Try to atomically transition a bit from set to clear. Returns `true` on succes. 
-static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_clearN(bitmap, idx, 1); -} - - - -// Specialized versions for common bit sequence sizes -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS - -// Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages - if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages - if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); - return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); -} - - // Called once a bit is cleared to see if the memory slice can be claimed. typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_set); @@ -225,6 +195,7 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); // If a bit is set in the bitmap, return `true` and set `idx` to the index of the highest bit. // Otherwise return `false` (and `*idx` is undefined). +// Used for unloading arena's bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); @@ -236,17 +207,24 @@ bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_a // Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); -// + +/* ---------------------------------------------------------------------------- + Binned concurrent bitmap + Assigns a size class to each chunk such that small blocks don't cause too + much fragmentation by keeping chunks for larger blocks separate. +---------------------------------------------------------------------------- */ + typedef enum mi_bbin_e { MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free) MI_BBIN_SMALL, // slice_count == 1 MI_BBIN_MEDIUM, // slice_count == 8 - MI_BBIN_OTHER, // slice_count > 1, and not 8 + MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS + MI_BBIN_OTHER, // slice_count > 1, and not 8 or MI_BFIELD_BITS MI_BBIN_COUNT } mi_bbin_t; static inline mi_bbin_t mi_bbin_of(size_t n) { - return (n==1 ? MI_BBIN_SMALL : (n==8 ? MI_BBIN_MEDIUM : MI_BBIN_OTHER)); + return (n==1 ? MI_BBIN_SMALL : (n==8 ? MI_BBIN_MEDIUM : (n==64 ? 
MI_BBIN_LARGE : MI_BBIN_OTHER))); } // An atomic "binned" bitmap for the free slices where we keep chunks reserved for particalar size classes @@ -308,8 +286,6 @@ static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_ // `n` cannot cross chunk boundaries, where `n <= MI_CHUNK_BITS`. bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); - - // Specialized versions for common bit sequence sizes bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 1-bit bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits diff --git a/test/test-stress.c b/test/test-stress.c index 277f9e6e..1996e52e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -353,7 +353,7 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,false,false); #else //mi_collect(true); - //mi_debug_show_arenas(true,false,false); + mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif #else From 3330d4353aeb5e5785d3a16a3b70d738a7c0c696 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 19:15:00 -0800 Subject: [PATCH 094/264] remove maxaccessed from general bitmaps --- src/arena-meta.c | 4 ++-- src/arena.c | 2 +- src/bitmap.c | 60 +++++++++++++++++++--------------------------- src/bitmap.h | 3 +-- src/heap.c | 4 +++- src/page-map.c | 2 +- src/page.c | 14 +++++++---- test/test-stress.c | 4 ++-- 8 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index 86a89755..49195e22 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -77,7 +77,7 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // initialize the page mpage->memid = memid; mi_bbitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); - const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); + const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bbitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); mi_assert_internal(info_blocks < MI_META_BLOCKS_PER_PAGE); mi_bbitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); @@ -142,7 +142,7 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page; mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage); mi_assert_internal(block_idx + block_count < MI_META_BLOCKS_PER_PAGE); - mi_assert_internal(mi_bitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); + mi_assert_internal(mi_bbitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); // we zero on free (and on the initial page allocation) so we don't need a "dirty" map _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); mi_bbitmap_setN(&mpage->blocks_free, block_idx, block_count); diff --git a/src/arena.c b/src/arena.c index 1547c9b2..9bc12272 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1258,7 +1258,7 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ char chunk_kind = ' '; if (chunk_bins != NULL) { - switch (chunk_bins[i]) { + switch (mi_atomic_load_relaxed(&chunk_bins[i])) { // case MI_BBIN_SMALL: chunk_kind = 'S'; break; case MI_BBIN_MEDIUM: chunk_kind = 'M'; break; case MI_BBIN_LARGE: chunk_kind = 'L'; break; diff --git a/src/bitmap.c b/src/bitmap.c index a847740b..ccc17514 100644 --- a/src/bitmap.c +++ 
b/src/bitmap.c @@ -45,6 +45,14 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } +// find the most significant bit that is set. +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) { + return mi_bsf(x, idx); +} + + // find each set bit in a bit field `x` and clear it, until it becomes zero. static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { @@ -873,17 +881,9 @@ static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { bitmap chunkmap -------------------------------------------------------------------------------- */ -static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) { - size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); - if mi_unlikely(chunk_idx > oldmax) { - mi_atomic_cas_strong_relaxed(&bitmap->chunk_max_accessed, &oldmax, chunk_idx); - } -} - static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - mi_bchunk_set(&bitmap->chunkmap, chunk_idx); - mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx); } static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { @@ -898,13 +898,12 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } - mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); return true; } /* -------------------------------------------------------------------------------- - bitmap + bitmap -------------------------------------------------------------------------------- */ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { @@ -1107,33 +1106,24 @@ typedef bool (mi_bitmap_visit_fun_t)(mi_bitmap_t* bitmap, size_t chunk_idx, size // If it returns `true` stop the search. static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bitmap_visit_fun_t* on_find, void* arg1, void* arg2) { - // we space out threads to reduce contention - const size_t cmap_max_count = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); - const size_t chunk_acc = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); - const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS; - const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS); - - // create a mask over the chunkmap entries to iterate over them efficiently - mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); - const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); - const size_t cmap_cycle = cmap_acc+1; - mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) - { + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = 0; i < chunkmap_max; i++) { // and for each chunkmap entry we iterate over its bits to find the chunks - mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[cmap_idx]); - size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? 
MI_BFIELD_BITS : cmap_acc_bits); - mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) - { - mi_assert_internal(eidx <= MI_BFIELD_BITS); - const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) { - return true; + const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t hi; + if (mi_bfield_find_highest_bit(cmap_entry, &hi)) { + mi_bfield_cycle_iterate(cmap_entry, tseq%8, hi+1, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) + { + mi_assert_internal(eidx <= MI_BFIELD_BITS); + const size_t chunk_idx = i*MI_BFIELD_BITS + eidx; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) { + return true; + } } + mi_bfield_cycle_iterate_end(Y); } - mi_bfield_cycle_iterate_end(Y); } - mi_bfield_cycle_iterate_end(X); return false; } @@ -1478,7 +1468,7 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, // only in the current size class! const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_acquire(&bbitmap->chunk_bins[chunk_idx]); if // (bin >= chunk_bin) { - (bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { + ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; size_t cidx; if ((*on_find)(chunk, n, &cidx)) { diff --git a/src/bitmap.h b/src/bitmap.h index f221f2cd..62c42129 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -98,8 +98,7 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set - size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; diff --git a/src/heap.c b/src/heap.c index dee404d2..1c2b017b 100644 --- a/src/heap.c +++ b/src/heap.c @@ -138,7 +138,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect retired pages _mi_heap_collect_retired(heap, force); - + + // if (_mi_is_main_thread()) { mi_debug_show_arenas(true, false, false); } + // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); diff --git a/src/page-map.c b/src/page-map.c index 7b74c711..64f4bbbb 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -17,7 +17,7 @@ static mi_memid_t mi_page_map_memid; // (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) -static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), +static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; bool _mi_page_map_init(void) { diff --git a/src/page.c b/src/page.c index a30db6c9..b3fdb78f 100644 --- a/src/page.c +++ b/src/page.c @@ -82,7 +82,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { 
mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - + // const size_t bsize = mi_page_block_size(page); // uint8_t* start = mi_page_start(page); //mi_assert_internal(start + page->capacity*page->block_size == page->top); @@ -623,7 +623,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); - + // initialize an initial free list mi_page_extend_free(heap,page); mi_assert(mi_page_immediate_available(page)); @@ -872,10 +872,14 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_heap_is_initialized(heap)); // call potential deferred free routines - // _mi_deferred_free(heap, false); + _mi_deferred_free(heap, false); - // free delayed frees from other threads (but skip contended ones) - // _mi_heap_delayed_free_partial(heap); + // collect every N generic mallocs + /*static long count = 0; + if (count++ > 100000) { + count = 0; + _mi_heap_collect_retired(heap,false); + }*/ // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); diff --git a/test/test-stress.c b/test/test-stress.c index 1996e52e..ae1e83b6 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -262,7 +262,7 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); - //mi_debug_show_arenas(true, false, false); + mi_debug_show_arenas(true, false, false); //mi_collect(true); //mi_debug_show_arenas(true, false, false); } @@ -352,7 +352,7 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,false,false); #else - //mi_collect(true); + mi_collect(false); mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif From d2f670e6e50d8bba25b8471e0793da070c4251d2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 19:54:01 -0800 Subject: [PATCH 095/264] add delay to purg'ing; call collect_retired every N generic allocs --- include/mimalloc/types.h | 1 + src/arena.c | 33 ++++++++++++++------------------- src/init.c | 2 ++ src/page.c | 15 +++++++-------- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index bf91a58a..057195a1 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -409,6 +409,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread mi_memid_t memid; // provenance of the heap struct itseft (meta or os) + long generic_count; long full_page_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint diff --git a/src/arena.c b/src/arena.c index 9bc12272..8feb165b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -43,6 +43,7 @@ typedef struct mi_arena_s { bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. 
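/* Illustrative sketch, not the patch's code: the expiration policy behind the extra
   `purge_expire_delay` (later `purge_expire_extend`) field added below. The first
   scheduled purge sets a base deadline of `now + delay`; subsequent frees extend it
   by `delay/10` each, capped at `10*delay` of total extension, and the purge runs
   once `now >= base + extension`. Plain non-atomic fields and invented names
   (`purge_clock_t`, `schedule_purge`, `purge_is_due`) are used just to show the
   policy. */
#include <stdbool.h>

typedef long long msecs_t;

typedef struct purge_clock_s {
  msecs_t expire;          // base expiration; 0 means no purge is scheduled
  msecs_t expire_extend;   // extra delay accumulated by later frees
} purge_clock_t;

static void schedule_purge(purge_clock_t* pc, msecs_t now, msecs_t delay) {
  if (pc->expire == 0) {
    pc->expire = now + delay;          // first free after a purge: set the base deadline
    pc->expire_extend = 0;
  }
  else if (pc->expire_extend < 10*delay) {
    pc->expire_extend += delay/10;     // later frees push the deadline out a little
  }
}

static bool purge_is_due(const purge_clock_t* pc, msecs_t now, bool force) {
  if (pc->expire == 0) return false;   // nothing scheduled
  return (force || now >= pc->expire + pc->expire_extend);
}

int main(void) {
  purge_clock_t pc = { 0, 0 };
  schedule_purge(&pc, 1000, 10);       // schedules the purge for t=1010
  schedule_purge(&pc, 1002, 10);       // extends the deadline by 1 to t=1011
  return (purge_is_due(&pc, 1011, false) ? 0 : 1);
}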
+ _Atomic(mi_msecs_t) purge_expire_delay; // mi_bbitmap_t* slices_free; // is the slice free? (a binned bitmap with size classes) mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) @@ -54,13 +55,6 @@ typedef struct mi_arena_s { // followed by the bitmaps (whose sizes depend on the arena size) } mi_arena_t; -// Every "page" in `pages_purge` points to purge info -// (since we use it for any free'd range and not just for pages) -typedef struct mi_purge_info_s { - _Atomic(mi_msecs_t) expire; - _Atomic(size_t) slice_count; -} mi_purge_info_t; - #define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) // 160 arenas is enough for ~2 TiB memory @@ -208,13 +202,17 @@ static size_t mi_memid_size(mi_memid_t memid) { /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} static mi_decl_noinline void* mi_arena_try_alloc_at( mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) { size_t slice_index; if (!mi_bbitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; - + // claimed it! void* p = mi_arena_slice_start(arena, slice_index); *memid = mi_memid_create_arena(arena, slice_index, slice_count); @@ -422,7 +420,7 @@ static mi_decl_noinline void* mi_arena_try_alloc( mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); - mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); + mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; again: // try to find free slices in the arena's @@ -949,7 +947,7 @@ static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_memid_needs_no_free(memid)); } - // purge expired decommits + // try to purge expired decommits mi_arenas_try_purge(false, false); } @@ -1123,6 +1121,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; + arena->purge_expire_delay = 0; // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps @@ -1414,11 +1413,6 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv Arena purge ----------------------------------------------------------- */ -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} - // reset or decommit in an arena and update the commit bitmap // assumes we own the area (i.e. 
slices_free is claimed by us) static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { @@ -1459,10 +1453,11 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (expire == 0) { mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + mi_atomic_storei64_release(&arena->purge_expire_delay, 0); + } + else if (mi_atomic_loadi64_acquire(&arena->purge_expire_delay) < 10*delay) { + mi_atomic_addi64_acq_rel(&arena->purge_expire_delay, (mi_msecs_t)(delay/10)); // add smallish extra delay } - //else { - // mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay - //} mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL); } } @@ -1509,7 +1504,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) { // check pre-conditions if (arena->memid.is_pinned) return false; - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire) + mi_atomic_loadi64_relaxed(&arena->purge_expire_delay); if (expire == 0) return false; // expired yet? diff --git a/src/init.c b/src/init.c index 9a26d56f..4465d603 100644 --- a/src/init.c +++ b/src/init.c @@ -108,6 +108,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { MI_BIN_FULL, 0, // page retired min/max NULL, // next MI_MEMID_STATIC, // memid + 0, 0, // full page retain false, // can reclaim true, // can eager abandon @@ -156,6 +157,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { MI_BIN_FULL, 0, // page retired min/max NULL, // next heap MI_MEMID_STATIC, // memid + 0, 2, // full page retain true, // allow page reclaim true, // allow page abandon diff --git a/src/page.c b/src/page.c index b3fdb78f..53773aae 100644 --- a/src/page.c +++ b/src/page.c @@ -870,16 +870,15 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); - - // call potential deferred free routines - _mi_deferred_free(heap, false); - + // collect every N generic mallocs - /*static long count = 0; - if (count++ > 100000) { - count = 0; + if (heap->generic_count++ > 10000) { + heap->generic_count = 0; + // call potential deferred free routines + _mi_deferred_free(heap, false); + // collect retired pages _mi_heap_collect_retired(heap,false); - }*/ + } // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); From 037cb167f8d49aa903a950ee38b494ff8bd563a4 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 16 Dec 2024 09:51:54 -0800 Subject: [PATCH 096/264] comments --- include/mimalloc/types.h | 12 ++++++------ src/bitmap.c | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 057195a1..f8615d1c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -100,7 +100,7 @@ terms of the MIT license. 
A copy of the license can be found in the file // Sizes are for 64-bit #ifndef MI_ARENA_SLICE_SHIFT -#ifdef MI_SMALL_PAGE_SHIFT // compatibility +#ifdef MI_SMALL_PAGE_SHIFT // backward compatibility #define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT #else #define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) @@ -149,7 +149,7 @@ typedef struct mi_arena_s mi_arena_t; // defined in `arena.c` // a memory id tracks the provenance of arena/OS allocated memory // --------------------------------------------------------------- -// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated. +// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated. // The memid keeps track of this. typedef enum mi_memkind_e { MI_MEM_NONE, // not allocated @@ -264,7 +264,7 @@ typedef uint8_t mi_heaptag_t; // // We don't count `freed` (as |free|) but use `used` to reduce // the number of memory accesses in the `mi_page_all_free` function(s). -// +// // Notes: // - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). // - If a page is not part of a heap it is called "abandoned" -- in @@ -310,7 +310,7 @@ typedef struct mi_page_s { #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. #define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b) -#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks +#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation #if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 @@ -348,12 +348,12 @@ typedef enum mi_page_kind_e { // ------------------------------------------------------ // Heaps -// +// // Provide first-class heaps to allocate from. // A heap just owns a set of pages for allocation and // can only be allocate/reallocate from the thread that created it. // Freeing blocks can be done from any thread though. -// +// // Per thread, there is always a default heap that is // used for allocation; it is initialized to statically // point to an empty heap to avoid initialization checks diff --git a/src/bitmap.c b/src/bitmap.c index ccc17514..be4f8d76 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -883,7 +883,7 @@ static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx); } static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { @@ -937,12 +937,12 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) // Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
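/* Illustrative sketch, not the patch's code: how a run of `n` bits starting at `idx`
   is split across fixed-size chunks, mirroring the first/middle/last split that the
   unsafe setN path below performs. `CHUNK_BITS` stands in for MI_BCHUNK_BITS
   (512 on 64-bit); the printf replaces the actual per-chunk set operations. */
#include <stddef.h>
#include <stdio.h>

#define CHUNK_BITS 512

static void set_range(size_t idx, size_t n) {
  size_t chunk_idx = idx / CHUNK_BITS;
  size_t cidx      = idx % CHUNK_BITS;    // offset within the first chunk
  while (n > 0) {
    size_t count = CHUNK_BITS - cidx;     // bits that still fit in this chunk
    if (count > n) count = n;
    printf("chunk %zu: set bits [%zu, %zu)\n", chunk_idx, cidx, cidx + count);
    n -= count;
    chunk_idx++;
    cidx = 0;                             // later chunks start at bit 0
  }
}

int main(void) {
  set_range(500, 600);   // crosses from chunk 0 into chunks 1 and 2
  return 0;
}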
static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) { mi_assert_internal(n>0); - + // start chunk and index size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; const size_t ccount = _mi_divide_up(n, MI_BCHUNK_BITS); - + // first update the chunkmap mi_bchunk_setN(cmap, chunk_idx, ccount, NULL); @@ -1433,6 +1433,9 @@ typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, // Go through the bbitmap and for every sequence of `n` set bits, call the visitor function. // If it returns `true` stop the search. +// +// This is used for finding free blocks and it is important to be efficient (with 2-level bitscan) +// but also reduce fragmentation (through size bins). static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* on_find) { // we space out threads to reduce contention @@ -1453,8 +1456,8 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { // don't search into non-accessed memory until we tried other size bins as well - if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { - break; + if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { + break; } // and for each chunkmap entry we iterate over its bits to find the chunks @@ -1466,8 +1469,8 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); // only in the current size class! - const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_acquire(&bbitmap->chunk_bins[chunk_idx]); - if // (bin >= chunk_bin) { + const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_relaxed(&bbitmap->chunk_bins[chunk_idx]); + if // (bin >= chunk_bin) { ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; size_t cidx; @@ -1482,7 +1485,7 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, } else { /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ - mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); + mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } } } From d9397be17803e0408944329d63151ae57827e582 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 16 Dec 2024 10:00:32 -0800 Subject: [PATCH 097/264] comments --- include/mimalloc/types.h | 4 ++-- src/bitmap.h | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f8615d1c..920a8e2c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -321,9 +321,9 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 8 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/6) // < 11 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 1 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/bitmap.h b/src/bitmap.h index 62c42129..4faaa3a1 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -210,15 +210,20 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi /* ---------------------------------------------------------------------------- Binned concurrent bitmap Assigns a size class to each chunk such that small blocks don't cause too - much fragmentation by keeping chunks for larger blocks separate. + much fragmentation since we keep chunks for larger blocks separate. ---------------------------------------------------------------------------- */ +// Size bins; larger bins are allowed to go into smaller bins. +// Since LARGE and MEDIUM are aligned (on word and byte boundaries respectively), +// they are larger than OTHER even though those can contain very large objects (but we +// don't want those in the MEDIUM or LARGE bins as these are variable size). +// SMALL can only be in small (and NONE), so they cannot fragment the larger bins. typedef enum mi_bbin_e { MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free) MI_BBIN_SMALL, // slice_count == 1 + MI_BBIN_OTHER, // slice_count: any other from the other bins, and 1 <= slice_count <= MI_BCHUNK_BITS MI_BBIN_MEDIUM, // slice_count == 8 MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS - MI_BBIN_OTHER, // slice_count > 1, and not 8 or MI_BFIELD_BITS MI_BBIN_COUNT } mi_bbin_t; From 98171fd80a34dfb5433a6efc5aab3537c6efa6a7 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 17 Dec 2024 00:24:32 -0800 Subject: [PATCH 098/264] testing on arm64 --- test/test-stress.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test-stress.c b/test/test-stress.c index 61aeabf5..e138ffb0 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -306,6 +306,9 @@ int main(int argc, char** argv) { #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); #endif + #if defined(NDEBUG) && !defined(USE_STD_MALLOC) + // mi_option_set(mi_option_purge_delay,-1); + #endif #ifndef USE_STD_MALLOC mi_stats_reset(); #endif @@ -352,8 +355,8 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,false,false); #else - mi_collect(false); - mi_debug_show_arenas(true,false,false); + // mi_collect(false); + // mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif #else From fdad1a0d4f2d04eeed0b10f46afd2c0609e3f204 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 09:49:09 -0800 Subject: [PATCH 099/264] fix infoslices needed calculation --- src/arena.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index d8b882d3..c5846329 100644 --- a/src/arena.c +++ b/src/arena.c @@ -52,6 +52,7 @@ typedef struct mi_arena_s { mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains 
abandoned full pages // followed by the bitmaps (whose sizes depend on the arena size) + // note: when adding bitmaps revise `mi_arena_info_slices_needed` } mi_arena_t; // Every "page" in `pages_purge` points to purge info @@ -1051,7 +1052,7 @@ static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_bas if (slice_count == 0) slice_count = MI_BCHUNK_BITS; mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0); const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BCHUNK_SIZE); - const size_t bitmaps_count = 4 + MI_BIN_COUNT; // free, commit, dirty, purge, and abandonded + const size_t bitmaps_count = 5 + MI_ARENA_BIN_COUNT; // free, commit, dirty, purge, pages, and abandoned const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count,NULL); const size_t size = base_size + bitmaps_size; @@ -1303,7 +1304,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); - if (show_pages) _mi_output_message("total pages in areanas: %zu\n", page_total); + if (show_pages) _mi_output_message("total pages in arenas: %zu\n", page_total); } From adfeb1f6f296ec23950fea90d7946e71ac0cf6de Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 10:43:31 -0800 Subject: [PATCH 100/264] fix bug in bitmap_forall_ranges --- src/arena.c | 30 +++++++++++++++++++----------- src/bitmap.c | 18 ++++++++---------- test/test-stress.c | 5 +++-- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/src/arena.c b/src/arena.c index c5846329..962e1898 100644 --- a/src/arena.c +++ b/src/arena.c @@ -43,6 +43,7 @@ typedef struct mi_arena_s { bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. + _Atomic(mi_msecs_t) purge_expire_extend; // the purge expiration may be extended by a bit mi_bitmap_t* slices_free; // is the slice free? mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) @@ -950,7 +951,7 @@ static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_memid_needs_no_free(memid)); } - // purge expired decommits + // try to purge expired decommits mi_arenas_try_purge(false, false); } @@ -1118,6 +1119,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; + arena->purge_expire_extend = 0; // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps @@ -1402,7 +1404,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c needs_recommit = _mi_os_purge(p, size); } else { - mi_assert_internal(false); // ? + mi_assert_internal(false); // can this happen? 
} // update committed bitmap @@ -1428,10 +1430,11 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (expire == 0) { mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + mi_atomic_storei64_release(&arena->purge_expire_extend, 0); + } + else if (mi_atomic_loadi64_acquire(&arena->purge_expire_extend) < 10*delay) { // limit max extension time + mi_atomic_addi64_acq_rel(&arena->purge_expire_extend, (mi_msecs_t)(delay/10)); // add smallish extra delay } - //else { - // mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay - //} mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL); } } @@ -1478,26 +1481,31 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) { // check pre-conditions if (arena->memid.is_pinned) return false; - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + mi_msecs_t expire_base = mi_atomic_loadi64_relaxed(&arena->purge_expire); + mi_msecs_t expire_extend = mi_atomic_loadi64_relaxed(&arena->purge_expire_extend); + const mi_msecs_t expire = expire_base + expire_extend; if (expire == 0) return false; // expired yet? if (!force && expire > now) return false; // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire_base, (mi_msecs_t)0)) { + mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); // and also reset the extend + } _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); - // go through all purge info's - // todo: instead of visiting per-bit, we should visit per range of bits + // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; - _mi_bitmap_forall_set(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); + _mi_bitmap_forall_set_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); // if not fully purged, make sure to purge again in the future if (!vinfo.all_purged) { const long delay = mi_arena_purge_delay(); mi_msecs_t expected = 0; - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expected, _mi_clock_now() + delay); + if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expected, _mi_clock_now() + delay)) { + mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); + } } return vinfo.any_purged; } diff --git a/src/bitmap.c b/src/bitmap.c index 2734e2b2..9440df31 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1332,18 +1332,16 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); - size_t bshift = 0; size_t bidx; - while (mi_bfield_find_least_bit(b, &bidx)) { - b >>= bidx; - bshift += bidx; - const size_t rng = mi_ctz(~b); // all the set bits from bidx - mi_assert_internal(rng>=1); - const size_t idx = base_idx + bshift + bidx; + while (mi_bfield_find_least_bit(b, &bidx)) { + const size_t rng = mi_ctz(~(b>>bidx)); // all the set bits from bidx + mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS); + const size_t idx = base_idx + bidx; + mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= 
MI_BFIELD_BITS); + mi_assert_internal((idx / MI_BCHUNK_BITS) < mi_bitmap_chunk_count(bitmap)); if (!visit(idx, rng, arena, arg)) return false; - // skip rng - b >>= rng; - bshift += rng; + // clear rng bits in b + b = b & ~mi_bfield_mask(rng, bidx); } } } diff --git a/test/test-stress.c b/test/test-stress.c index 9122d70e..1591b38e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -305,6 +305,7 @@ int main(int argc, char** argv) { #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + mi_option_set(mi_option_purge_delay,10); #endif #ifndef USE_STD_MALLOC mi_stats_reset(); @@ -353,8 +354,8 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,false,false); #else //mi_collect(true); - //mi_debug_show_arenas(true,false,false); - // mi_stats_print(NULL); + mi_debug_show_arenas(true,false,false); + mi_stats_print(NULL); #endif #else mi_stats_print(NULL); // so we see rss/commit/elapsed From c585753dcef498ebc1934b062dfb4d0a69306d05 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 11:54:26 -0800 Subject: [PATCH 101/264] fix purging with ranges --- src/arena.c | 19 +++++++++---------- src/bitmap.c | 16 ++++++++++++---- test/test-stress.c | 2 +- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/arena.c b/src/arena.c index 962e1898..cb4936d4 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1398,14 +1398,8 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); - bool needs_recommit = false; // reset needs no recommit, decommit does need it - if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)) { - // all slices are committed, we can purge the entire range - needs_recommit = _mi_os_purge(p, size); - } - else { - mi_assert_internal(false); // can this happen? 
- } + const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); + const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed); // update committed bitmap if (needs_recommit) { @@ -1450,11 +1444,13 @@ static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size if (mi_bitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { // purge mi_arena_purge(arena, slice_index, slice_count); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, slice_index, slice_count)); // and reset the free range mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); return true; } else { + // was allocated again already return false; } } @@ -1463,12 +1459,15 @@ static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, m mi_purge_visit_info_t* vinfo = (mi_purge_visit_info_t*)arg; // try to purge: first claim the free blocks if (mi_arena_try_purge_range(arena, slice_index, slice_count)) { - vinfo->any_purged = true; + vinfo->any_purged = true; + vinfo->all_purged = true; } else { // failed to claim the full range, try per slice instead for (size_t i = 0; i < slice_count; i++) { - vinfo->any_purged = vinfo->any_purged || mi_arena_try_purge_range(arena, slice_index + i, 1); + const bool purged = mi_arena_try_purge_range(arena, slice_index + i, 1); + vinfo->any_purged = vinfo->any_purged || purged; + vinfo->all_purged = vinfo->all_purged && purged; } } // done: clear the purge bits diff --git a/src/bitmap.c b/src/bitmap.c index 9440df31..fb334d8a 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -81,7 +81,7 @@ static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { // Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = mi_bfield_one()< first*/\ @@ -1332,9 +1332,16 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + #if MI_DEBUG > 1 + const size_t bpopcount = mi_popcount(b); + size_t rngcount = 0; + #endif size_t bidx; while (mi_bfield_find_least_bit(b, &bidx)) { const size_t rng = mi_ctz(~(b>>bidx)); // all the set bits from bidx + #if MI_DEBUG > 1 + rngcount += rng; + #endif mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS); const size_t idx = base_idx + bidx; mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= MI_BFIELD_BITS); @@ -1343,6 +1350,7 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi // clear rng bits in b b = b & ~mi_bfield_mask(rng, bidx); } + mi_assert_internal(rngcount == bpopcount); } } } diff --git a/test/test-stress.c b/test/test-stress.c index 1591b38e..1f8df226 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -262,7 +262,7 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); - //mi_debug_show_arenas(true, false, false); + mi_debug_show_arenas(true, false, false); //mi_collect(true); //mi_debug_show_arenas(true, false, false); } From 34d03f3981670cca6620fc1d7b4e97bf691ac893 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 12:32:18 -0800 Subject: [PATCH 102/264] atomically clear purge bits when visiting --- src/arena.c | 33 
+++++++++++++++------------------ src/bitmap.c | 7 ++++--- src/bitmap.h | 2 +- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/src/arena.c b/src/arena.c index cb4936d4..b6e98863 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1392,19 +1392,21 @@ static long mi_arena_purge_delay(void) { // reset or decommit in an arena and update the commit bitmap // assumes we own the area (i.e. slices_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { +// returns if the memory is no longer committed (versus reset which keeps the commit) +static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(!arena->memid.is_pinned); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); - const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed); + const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */); // update committed bitmap if (needs_recommit) { mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } + return needs_recommit; } @@ -1421,12 +1423,13 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ } else { // schedule purge - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire == 0) { - mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + mi_msecs_t expire0 = 0; + if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire0, _mi_clock_now() + delay)) { + // expiration was not yet set mi_atomic_storei64_release(&arena->purge_expire_extend, 0); } else if (mi_atomic_loadi64_acquire(&arena->purge_expire_extend) < 10*delay) { // limit max extension time + // already an expiration was set mi_atomic_addi64_acq_rel(&arena->purge_expire_extend, (mi_msecs_t)(delay/10)); // add smallish extra delay } mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL); @@ -1443,8 +1446,8 @@ typedef struct mi_purge_visit_info_s { static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size_t slice_count) { if (mi_bitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { // purge - mi_arena_purge(arena, slice_index, slice_count); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, slice_index, slice_count)); + bool decommitted = mi_arena_purge(arena, slice_index, slice_count); MI_UNUSED(decommitted); + mi_assert_internal(!decommitted || mi_bitmap_is_clearN(arena->slices_committed, slice_index, slice_count)); // and reset the free range mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); return true; @@ -1470,8 +1473,8 @@ static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, m vinfo->all_purged = vinfo->all_purged && purged; } } - // done: clear the purge bits - mi_bitmap_clearN(arena->slices_purge, slice_index, slice_count); + // don't clear the purge bits as that is done atomically be the _bitmap_forall_set_ranges + // mi_bitmap_clearN(arena->slices_purge, slice_index, slice_count); return true; // continue } @@ -1495,17 +1498,11 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); // go through all purge info's (with max MI_BFIELD_BITS ranges at a 
time) + // this also clears those ranges atomically (so any newly freed blocks will get purged next + // time around) mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; - _mi_bitmap_forall_set_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); + _mi_bitmap_forall_setc_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); - // if not fully purged, make sure to purge again in the future - if (!vinfo.all_purged) { - const long delay = mi_arena_purge_delay(); - mi_msecs_t expected = 0; - if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expected, _mi_clock_now() + delay)) { - mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); - } - } return vinfo.any_purged; } diff --git a/src/bitmap.c b/src/bitmap.c index fb334d8a..a534bba5 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1316,9 +1316,10 @@ bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_a } // Visit all set bits in a bitmap but try to return ranges (within bfields) if possible. -// used by purging to purge larger ranges if possible +// Also clear those ranges atomically. +// Used by purging to purge larger ranges when possible // todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? -bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { +bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { // for all chunkmap entries const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); for (size_t i = 0; i < chunkmap_max; i++) { @@ -1331,7 +1332,7 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); - mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + mi_bfield_t b = mi_atomic_exchange_acq_rel(&chunk->bfields[j], 0); // can be relaxed? 
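/* Illustrative sketch, not the patch's code: how one 64-bit bitfield snapshot is
   decomposed into maximal runs of set bits, as the loop above does per bfield when
   purging ranges. Compiler builtins (__builtin_ctzll, GCC/Clang) replace the mi_*
   bit helpers, and the snapshot is taken non-atomically here. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static void visit_set_ranges(uint64_t b) {
  while (b != 0) {
    const size_t idx = (size_t)__builtin_ctzll(b);          // start of the next run
    const uint64_t shifted = b >> idx;
    const size_t rng = (~shifted == 0) ? (64 - idx)         // run reaches the top bit
                                       : (size_t)__builtin_ctzll(~shifted);
    printf("range: start %zu, length %zu\n", idx, rng);
    if (idx + rng >= 64) break;                             // nothing left above this run
    b &= ~(((UINT64_C(1) << rng) - 1) << idx);              // clear the visited run
  }
}

int main(void) {
  visit_set_ranges(0xF00F0u);   // expect ranges (4,4) and (16,4)
  return 0;
}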
#if MI_DEBUG > 1 const size_t bpopcount = mi_popcount(b); size_t rngcount = 0; diff --git a/src/bitmap.h b/src/bitmap.h index 4afcdaf1..47c22025 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -234,6 +234,6 @@ typedef bool (mi_forall_set_fun_t)(size_t slice_index, size_t slice_count, mi_ar bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); // Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) -bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); +bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); #endif // MI_BITMAP_H From 84bb1c2712d1b01f828597e1620ef2280db1a8b7 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 18:10:28 -0800 Subject: [PATCH 103/264] adjust stats more clearly to avoid double counting commits --- src/arena.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/arena.c b/src/arena.c index b6e98863..e7564cd6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -234,12 +234,24 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // commit requested, but the range may not be committed as a whole: ensure it is committed now if (!mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)) { // not fully committed: commit the full range and set the commit bits - // (this may race and we may double-commit which is fine) + // we set the bits first since we own these slices (they are no longer free) + size_t already_committed_count = 0; + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); + // adjust the stats so we don't double count the commits + if (already_committed_count > 0) { + _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + } + // now actually commit bool commit_zero = false; if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) { + // failed to commit (todo: give warning?) 
+ if (already_committed_count > 0) { + _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + } memid->initially_committed = false; } else { + // committed if (commit_zero) { memid->initially_zero = true; } #if MI_DEBUG > 1 if (memid->initially_zero) { @@ -248,13 +260,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_zero = false; } } - #endif - size_t already_committed_count = 0; - mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); - if (already_committed_count < slice_count) { - // todo: also decrease total - mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); - } + #endif } } if (memid->initially_zero) { @@ -798,7 +804,7 @@ void _mi_arena_page_free(mi_page_t* page) { Arena abandon ----------------------------------------------------------- */ -static void mi_arena_page_abandon_no_stat(mi_page_t* page) { +void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -827,12 +833,8 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { // page is full (or a singleton), page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page } - _mi_page_unown(page); -} - -void _mi_arena_page_abandon(mi_page_t* page) { - mi_arena_page_abandon_no_stat(page); _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1); + _mi_page_unown(page); } bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { @@ -849,7 +851,8 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { } else { _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1); - mi_arena_page_abandon_no_stat(page); + _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1); // adjust as we are not abandoning fresh + _mi_arena_page_abandon(page); return true; } } From 58b726be6f814906870738ac225e119350c7e243 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 17 Dec 2024 18:57:00 -0800 Subject: [PATCH 104/264] better stats for commit on overcommit systems (by not counting on-demand commit upfront) --- include/mimalloc/types.h | 12 ++++++------ src/arena.c | 39 +++++++++++++++++++++++++++++++++------ src/stats.c | 22 ++++++++++++---------- 3 files changed, 51 insertions(+), 22 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 4c998f90..0cf909d0 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -488,8 +488,8 @@ typedef struct mi_stats_s { void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); // adjust stat in special cases to compensate for double counting -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount); -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount); +void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc); +void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free); // counters can just be increased void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); @@ -497,14 +497,14 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) #define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) #define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( 
&(stat), amount) -#define mi_stat_adjust_increase(stat,amount) _mi_stat_adjust_increase( &(stat), amount) -#define mi_stat_adjust_decrease(stat,amount) _mi_stat_adjust_decrease( &(stat), amount) +#define mi_stat_adjust_increase(stat,amnt,b) _mi_stat_adjust_increase( &(stat), amnt, b) +#define mi_stat_adjust_decrease(stat,amnt,b) _mi_stat_adjust_decrease( &(stat), amnt, b) #else #define mi_stat_increase(stat,amount) ((void)0) #define mi_stat_decrease(stat,amount) ((void)0) #define mi_stat_counter_increase(stat,amount) ((void)0) -#define mi_stat_adjuct_increase(stat,amount) ((void)0) -#define mi_stat_adjust_decrease(stat,amount) ((void)0) +#define mi_stat_adjuct_increase(stat,amnt,b) ((void)0) +#define mi_stat_adjust_decrease(stat,amnt,b) ((void)0) #endif #define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount) diff --git a/src/arena.c b/src/arena.c index e7564cd6..29279b86 100644 --- a/src/arena.c +++ b/src/arena.c @@ -222,9 +222,13 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( *memid = mi_memid_create_arena(arena, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; - // set the dirty bits + // set the dirty bits and track which slices become accessible + size_t touched_slices = slice_count; if (arena->memid.initially_zero) { - memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL); + size_t already_dirty = 0; + memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, &already_dirty); + mi_assert_internal(already_dirty <= touched_slices); + touched_slices -= already_dirty; } // set commit state @@ -239,7 +243,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); // adjust the stats so we don't double count the commits if (already_committed_count > 0) { - _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count), true /* on alloc */); } // now actually commit bool commit_zero = false; @@ -263,6 +267,15 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( #endif } } + else { + // already fully commited. + // if the OS has overcommit, and this is the first time we access these pages, then + // count the commit now (as at arena reserve we didn't count those commits as these are on-demand) + if (_mi_os_has_overcommit() && touched_slices > 0) { + _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(touched_slices)); + } + } + // tool support if (memid->initially_zero) { mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE); } @@ -324,17 +337,25 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re // commit eagerly? bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + const bool overcommit = _mi_os_has_overcommit(); + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = overcommit; } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } + // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice + // is actually allocated for the first time it will be counted. 
+ const bool adjust = (overcommit && arena_commit); + if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true /* on alloc */); } // and try to reserve the arena int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { + if (adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back // failed, try a smaller size? const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); + if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true); } if (arena_reserve > small_arena_reserve) { // try again err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + if (err != 0 && adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back } } return (err==0); @@ -851,7 +872,7 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { } else { _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1); - _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1); // adjust as we are not abandoning fresh + _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh _mi_arena_page_abandon(page); return true; } @@ -1402,7 +1423,13 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); - const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); + //const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); + size_t already_committed; + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); + const bool all_committed = (already_committed == slice_count); + if (mi_option_is_enabled(mi_option_purge_decommits)) { + _mi_stat_adjust_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed), false /* on freed */); + } const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */); // update committed bitmap diff --git a/src/stats.c b/src/stats.c index 53937330..bb17b936 100644 --- a/src/stats.c +++ b/src/stats.c @@ -54,21 +54,23 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { // Adjust stats to compensate; for example before committing a range, // first adjust downwards with parts that were already committed so // we avoid double counting. -static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) { +static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { if (amount == 0) return; if mi_unlikely(mi_is_in_main(stat)) { // adjust atomically mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_addi64_relaxed(&stat->allocated, amount); - mi_atomic_addi64_relaxed(&stat->freed, amount); + mi_atomic_addi64_relaxed((on_alloc ? 
&stat->allocated : &stat->freed), amount); } else { // don't affect the peak stat->current += amount; - // add to both - stat->allocated += amount; - stat->freed += amount; + if (on_alloc) { + stat->allocated += amount; + } + else { + stat->freed += amount; + } } } @@ -91,12 +93,12 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, -((int64_t)amount)); } -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount) { - mi_stat_adjust(stat, (int64_t)amount); +void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust(stat, (int64_t)amount, on_alloc); } -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) { - mi_stat_adjust(stat, -((int64_t)amount)); +void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust(stat, -((int64_t)amount), on_alloc); } // must be thread safe as it is called from stats_merge From fb9093840897749cc567c798a628d9e8f6399956 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 19:11:23 -0800 Subject: [PATCH 105/264] adjust stats more clearly to avoid double counting commits --- src/arena.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index d02d4760..044d3f39 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1451,13 +1451,11 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c size_t already_committed; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); const bool all_committed = (already_committed == slice_count); - if (mi_option_is_enabled(mi_option_purge_decommits)) { - _mi_stat_adjust_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed), false /* on freed */); - } const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */); // update committed bitmap if (needs_recommit) { + _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } return needs_recommit; From 264d5a67049e346eb27bb0b6009c44a14de48509 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 19:13:03 -0800 Subject: [PATCH 106/264] update stat adjustment for purging --- src/arena.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 29279b86..44c909c1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1427,13 +1427,11 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c size_t already_committed; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); const bool all_committed = (already_committed == slice_count); - if (mi_option_is_enabled(mi_option_purge_decommits)) { - _mi_stat_adjust_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed), false /* on freed */); - } const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? 
*/); // update committed bitmap if (needs_recommit) { + _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } return needs_recommit; From de8001c107a18c3139d0e024095f7206ccf1b1e6 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 19 Dec 2024 19:18:04 -0800 Subject: [PATCH 107/264] add specialized is_set for 1 bit --- src/bitmap.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index a534bba5..6fae1ed6 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -200,21 +200,40 @@ static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b, bool* all // ------- mi_bfield_atomic_is_set --------------------------------------- +// Check if a bit is set +static inline bool mi_bfield_atomic_is_set(_Atomic(mi_bfield_t)*b, const size_t idx) { + const mi_bfield_t x = mi_atomic_load_relaxed(b); + return ((x & mi_bfield_mask(1,idx)) != 0); +} + +// Check if a bit is clear +static inline bool mi_bfield_atomic_is_clear(_Atomic(mi_bfield_t)*b, const size_t idx) { + const mi_bfield_t x = mi_atomic_load_relaxed(b); + return ((x & mi_bfield_mask(1, idx)) == 0); +} + +// Check if a bit is xset +static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, const size_t idx) { + if (set) return mi_bfield_atomic_is_set(b, idx); + else return mi_bfield_atomic_is_clear(b, idx); +} // Check if all bits corresponding to a mask are set. -static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) { mi_assert_internal(mask != 0); - return ((*b & mask) == mask); + const mi_bfield_t x = mi_atomic_load_relaxed(b); + return ((x & mask) == mask); } // Check if all bits corresponding to a mask are clear. -static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) { mi_assert_internal(mask != 0); - return ((*b & mask) == 0); + const mi_bfield_t x = mi_atomic_load_relaxed(b); + return ((x & mask) == 0); } // Check if all bits corresponding to a mask are set/cleared. 
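The specialized single-bit and mask-based tests added above boil down to one relaxed atomic load followed by an ordinary mask comparison. The following stand-alone sketch (C11 atomics; the bfield_* helpers are illustrative, not the mimalloc ones) shows the shape of both variants:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef uint64_t bfield_t;

// mask of `n` consecutive bits starting at bit `idx` (1 <= n, idx + n <= 64)
static inline bfield_t bfield_mask(size_t n, size_t idx) {
  return ((n >= 64 ? ~(bfield_t)0 : (((bfield_t)1 << n) - 1)) << idx);
}

// true if all bits in `mask` are set: a single relaxed load, then a mask test
static inline bool bfield_is_set_mask(const _Atomic(bfield_t)* b, bfield_t mask) {
  const bfield_t x = atomic_load_explicit(b, memory_order_relaxed);
  return ((x & mask) == mask);
}

// single-bit variant: no general mask needs to be constructed
static inline bool bfield_is_set(const _Atomic(bfield_t)* b, size_t idx) {
  const bfield_t x = atomic_load_explicit(b, memory_order_relaxed);
  return (((x >> idx) & 1) != 0);
}

Checking a whole run of bits costs the same single load as checking one bit, which is why only the mask construction is worth specializing for the common 1-bit case.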
-static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask) { mi_assert_internal(mask != 0); if (set) return mi_bfield_atomic_is_set_mask(b, mask); else return mi_bfield_atomic_is_clear_mask(b, mask); @@ -359,12 +378,9 @@ static inline bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t if (n==0) return true; const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - if mi_likely(n<=MI_BFIELD_BITS) { - return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); - } - else { - return mi_bchunk_is_xsetN_(set, chunk, i, idx, n); - } + if mi_likely(n==1) { return mi_bfield_atomic_is_xset(set, &chunk->bfields[i], idx); } + if mi_likely(n<=MI_BFIELD_BITS) { return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } + return mi_bchunk_is_xsetN_(set, chunk, i, idx, n); } From 3746bf79edb5f09283b336d4b556a5a519cfea75 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 19 Dec 2024 21:30:03 -0800 Subject: [PATCH 108/264] small fixes; max object size 1/8th of a pages --- include/mimalloc/types.h | 4 ++-- src/arena.c | 2 +- src/bitmap.c | 10 +++++----- src/heap.c | 8 ++++++-- src/init.c | 12 ++++++------ src/page.c | 2 +- 6 files changed, 21 insertions(+), 17 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 085879bd..61681138 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -321,8 +321,8 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. // (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/6) // < 11 KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 11 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 128 KiB #define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 1 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index 044d3f39..e0044392 100644 --- a/src/arena.c +++ b/src/arena.c @@ -313,7 +313,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (arena_count >= 1 && arena_count <= 128) { // scale up the arena sizes exponentially every 4 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16); + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/2, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; diff --git a/src/bitmap.c b/src/bitmap.c index c64f227b..a04762af 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -352,7 +352,7 @@ static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, b if (n==1) return mi_bchunk_clear(chunk, cidx, pmaybe_all_clear); if (n==MI_BFIELD_BITS) return mi_bchunk_clearX(chunk, cidx, pmaybe_all_clear); if (n bfields[chunk_idx]); if (!allow_all_set && (~b == 0)) return false; @@ -1277,18 +1277,18 @@ bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* vis size_t rngcount = 0; #endif size_t bidx; - while (mi_bfield_find_least_bit(b, &bidx)) { + while (mi_bfield_find_least_bit(b, &bidx)) { const 
size_t rng = mi_ctz(~(b>>bidx)); // all the set bits from bidx #if MI_DEBUG > 1 rngcount += rng; - #endif + #endif mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS); const size_t idx = base_idx + bidx; mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= MI_BFIELD_BITS); mi_assert_internal((idx / MI_BCHUNK_BITS) < mi_bitmap_chunk_count(bitmap)); if (!visit(idx, rng, arena, arg)) return false; // clear rng bits in b - b = b & ~mi_bfield_mask(rng, bidx); + b = b & ~mi_bfield_mask(rng, bidx); } mi_assert_internal(rngcount == bpopcount); } diff --git a/src/heap.c b/src/heap.c index 1c2b017b..a24b8356 100644 --- a/src/heap.c +++ b/src/heap.c @@ -166,8 +166,12 @@ void mi_collect(bool force) mi_attr_noexcept { ----------------------------------------------------------- */ mi_heap_t* mi_heap_get_default(void) { - mi_thread_init(); - return mi_prim_get_default_heap(); + mi_heap_t* heap = mi_prim_get_default_heap(); + if mi_unlikely(!mi_heap_is_initialized(heap)) { + mi_thread_init(); + heap = mi_prim_get_default_heap(); + } + return heap; } static bool mi_heap_is_default(const mi_heap_t* heap) { diff --git a/src/init.c b/src/init.c index 4465d603..241a3826 100644 --- a/src/init.c +++ b/src/init.c @@ -157,7 +157,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { MI_BIN_FULL, 0, // page retired min/max NULL, // next heap MI_MEMID_STATIC, // memid - 0, + 0, 2, // full page retain true, // allow page reclaim true, // allow page abandon @@ -289,7 +289,7 @@ mi_decl_noinline mi_tld_t* _mi_tld(void) { } if (mi_tld==NULL) { mi_tld = mi_tld_alloc(); - } + } return mi_tld; } @@ -361,11 +361,11 @@ static bool _mi_thread_heap_init(void) { //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { - // allocates tld data - // note: we cannot access thread-locals yet as that can cause (recursive) allocation + // allocates tld data + // note: we cannot access thread-locals yet as that can cause (recursive) allocation // (on macOS <= 14 for example where the loader allocates thread-local data on demand). - mi_tld_t* tld = mi_tld_alloc(); - + mi_tld_t* tld = mi_tld_alloc(); + // allocate and initialize the heap mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld); diff --git a/src/page.c b/src/page.c index 53773aae..0e3e9bb5 100644 --- a/src/page.c +++ b/src/page.c @@ -870,7 +870,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); - + // collect every N generic mallocs if (heap->generic_count++ > 10000) { heap->generic_count = 0; From 2db407d1e9ce00bc8e6363e66d845fc9ec78628b Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 20 Dec 2024 11:54:39 -0800 Subject: [PATCH 109/264] revert back to generating mimalloc.dll instead of mimalloc-override.dll --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e7a6aca..30a6b3e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -673,8 +673,8 @@ if (MI_OVERRIDE) target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE) if (WIN32) # on windows we should generate mimalloc-override.dll. 
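Backing up to the bitmap range visitor above: it walks a bit field as maximal runs of set bits by finding the least set bit, measuring the run with a count-trailing-zeros of the inverted, shifted word, visiting that range, and then clearing it. A stand-alone sketch of that loop on a plain uint64_t (GCC/Clang __builtin_ctzll assumed; these are not the mimalloc helpers):

#include <stdint.h>
#include <stdio.h>

// visit every maximal run of set bits in `b` as (start, length)
static void visit_set_runs(uint64_t b) {
  while (b != 0) {
    const unsigned start = (unsigned)__builtin_ctzll(b);       // least set bit
    const uint64_t shifted = b >> start;                       // run now begins at bit 0
    const unsigned len = (~shifted == 0) ? (64 - start)        // run reaches the top bit
                                         : (unsigned)__builtin_ctzll(~shifted);
    printf("run: start %u, length %u\n", start, len);
    if (start + len >= 64) break;                              // nothing left above the run
    b &= ~(((UINT64_C(1) << len) - 1) << start);               // clear the visited run
  }
}

int main(void) {
  visit_set_runs(UINT64_C(0xF0F0000000000F01));   // runs at 0, 8..11, 52..55, 60..63
  return 0;
}

Visiting runs instead of single bits is what lets each callback receive a whole slice range in one call rather than one slice at a time.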
- string(REPLACE "mimalloc" "mimalloc-override" mi_override_output_name ${mi_basename}) - set_target_properties(mimalloc PROPERTIES OUTPUT_NAME ${mi_override_output_name}) + # string(REPLACE "mimalloc" "mimalloc-override" mi_override_output_name ${mi_basename}) + # set_target_properties(mimalloc PROPERTIES OUTPUT_NAME ${mi_override_output_name}) endif() endif() if(NOT WIN32) From 5614c5052ec5d7490391da3b98dc4bdcc0e1ed7c Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 20 Dec 2024 11:56:04 -0800 Subject: [PATCH 110/264] don't prefer high used candidate if it is too full --- src/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/page.c b/src/page.c index a30db6c9..0de56752 100644 --- a/src/page.c +++ b/src/page.c @@ -682,7 +682,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m _mi_page_free(page_candidate, pq); page_candidate = page; } - else if (page->used >= page_candidate->used) { // && !mi_page_is_mostly_used(page)) { + else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate From 7141d9f1642ff24f5d94e5ae3767f3212153f25f Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 17:31:48 -0800 Subject: [PATCH 111/264] remove busy wait for arena reservation --- src/arena.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/arena.c b/src/arena.c index 44c909c1..74cd4977 100644 --- a/src/arena.c +++ b/src/arena.c @@ -453,7 +453,7 @@ static mi_decl_noinline void* mi_arena_try_alloc( mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; -again: + // try to find free slices in the arena's p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; @@ -465,22 +465,25 @@ again: if (_mi_preloading()) return NULL; // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) - if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { - mi_arena_id_t arena_id = 0; - bool ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + const size_t arena_count = mi_arena_get_count(); + if (mi_lock_acquire(&mi_arena_reserve_lock)) { + bool ok = true; + if (arena_count == mi_arena_get_count()) { + // we are the first to enter the lock, reserve a fresh arena + mi_arena_id_t arena_id = 0; + ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + } + else { + // another thread already reserved a new arena + } mi_lock_release(&mi_arena_reserve_lock); if (ok) { - // and try allocate in there + // try once more to allocate in the new arena mi_assert_internal(req_arena_id == _mi_arena_id_none()); p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; } } - else { - // if we are racing with another thread wait until the new arena is reserved (todo: a better yield?) 
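Patch 111 replaces that busy wait with a blocking lock plus a re-check of the arena count, so a thread that loses the race simply retries allocation in whatever arena the winner reserved. A compressed, self-contained sketch of that shape using C11 threads (the toy_* names and fixed capacities are invented for illustration):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <threads.h>

// toy "arena": a capacity we carve allocations from with an atomic bump counter
typedef struct toy_arena_s { atomic_size_t used; size_t capacity; } toy_arena_t;

#define MAX_ARENAS 8
static toy_arena_t   arenas[MAX_ARENAS];
static atomic_size_t arena_count;
static mtx_t         reserve_lock;

static void* toy_try_alloc(size_t size) {
  const size_t n = atomic_load(&arena_count);
  for (size_t i = 0; i < n; i++) {
    const size_t old = atomic_fetch_add(&arenas[i].used, size);
    if (old + size <= arenas[i].capacity) return &arenas[i];  // "allocated" in arena i
    atomic_fetch_sub(&arenas[i].used, size);                  // roll back: arena i is full
  }
  return NULL;
}

// let only one thread reserve at a time; losers re-check instead of spinning
static void* toy_alloc(size_t size) {
  void* p = toy_try_alloc(size);
  if (p != NULL) return p;
  const size_t seen = atomic_load(&arena_count);  // snapshot before taking the lock
  bool ok = true;
  mtx_lock(&reserve_lock);
  if (seen == atomic_load(&arena_count)) {        // we are first: reserve one fresh arena
    const size_t i = atomic_load(&arena_count);
    ok = (i < MAX_ARENAS);
    if (ok) { arenas[i].capacity = 1024*1024; atomic_store(&arena_count, i + 1); }
  }                                               // else: someone reserved while we waited
  mtx_unlock(&reserve_lock);
  return (ok ? toy_try_alloc(size) : NULL);
}

int main(void) {
  mtx_init(&reserve_lock, mtx_plain);
  printf("%p\n", toy_alloc(4096));                // triggers the first reservation
  mtx_destroy(&reserve_lock);
  return 0;
}

The snapshot-before-lock, compare-after-lock step is what keeps a burst of contending threads from reserving more than one new arena, without any spinning.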
- mi_atomic_yield(); - goto again; - } return NULL; } From a5b7d7f26461d0d241b6de41f215d63dbfa642cb Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 21:38:31 -0800 Subject: [PATCH 112/264] subprocesses own arena's --- include/mimalloc.h | 2 +- include/mimalloc/atomic.h | 2 +- include/mimalloc/internal.h | 15 +- include/mimalloc/types.h | 56 +++---- src/alloc.c | 4 +- src/arena-meta.c | 6 +- src/arena.c | 315 +++++++++++++++++------------------- src/bitmap.c | 7 +- src/bitmap.h | 4 +- src/free.c | 6 +- src/heap.c | 7 +- src/init.c | 259 ++++++++++++++++------------- src/page.c | 2 +- 13 files changed, 351 insertions(+), 334 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 24217fae..7a58e54c 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -279,7 +279,7 @@ mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_commit mi_decl_export void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's -typedef int mi_arena_id_t; +typedef void* mi_arena_id_t; mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 7dc492f6..ddb5a9a3 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -434,7 +434,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { InitializeSRWLock(lock); } static inline void mi_lock_done(mi_lock_t* lock) { - // nothing + (void)(lock); } diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index a5ca3e27..24792f8c 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -101,8 +101,10 @@ bool _mi_is_main_thread(void); size_t _mi_current_thread_count(void); bool _mi_preloading(void); // true while the C runtime is not initialized yet void _mi_thread_done(mi_heap_t* heap); -mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` +mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` +mi_subproc_t* _mi_subproc(void); +mi_subproc_t* _mi_subproc_main(void); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; size_t _mi_thread_seq_id(void) mi_attr_noexcept; @@ -142,10 +144,11 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t m // arena.c mi_arena_id_t _mi_arena_id_none(void); -void _mi_arena_init(void); -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); + +void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void* _mi_arena_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +bool 
_mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge); void _mi_arena_unsafe_destroy_all(void); @@ -524,7 +527,7 @@ static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { if (heap != NULL) { page->heap = heap; page->heap_tag = heap->tag; - mi_atomic_store_release(&page->xthread_id, heap->thread_id); + mi_atomic_store_release(&page->xthread_id, heap->tld->thread_id); } else { page->heap = NULL; diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 0cf909d0..4d43e887 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -243,9 +243,6 @@ typedef size_t mi_page_flags_t; // atomically in `free.c:mi_free_block_mt`. typedef uintptr_t mi_thread_free_t; -// Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) -typedef struct mi_subproc_s mi_subproc_t; - // A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython) typedef uint8_t mi_heaptag_t; @@ -299,7 +296,6 @@ typedef struct mi_page_s { mi_heap_t* heap; // heap this threads belong to. struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - mi_subproc_t* subproc; // sub-process of this heap mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -380,7 +376,7 @@ typedef struct mi_random_cxt_s { // In debug mode there is a padding structure at the end of the blocks to check for buffer overflows -#if (MI_PADDING) +#if MI_PADDING typedef struct mi_padding_s { uint32_t canary; // encoded block value to check validity of the padding (in case of overflow) uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes) @@ -397,10 +393,8 @@ typedef struct mi_padding_s { // A heap owns a set of pages. struct mi_heap_s { - mi_tld_t* tld; - // _Atomic(mi_block_t*) thread_delayed_free; - mi_threadid_t thread_id; // thread this heap belongs too - mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) + mi_tld_t* tld; // thread-local data + mi_arena_t* exclusive_arena; // if the heap belongs to a specific arena (or NULL) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -408,7 +402,6 @@ struct mi_heap_s { size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread - mi_memid_t memid; // provenance of the heap struct itseft (meta or os) long full_page_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint @@ -421,7 +414,8 @@ struct mi_heap_s { size_t guarded_sample_count; // current sample count (counting down to 0) #endif mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. 
- mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") + mi_page_queue_t pages[MI_BIN_COUNT]; // queue of pages for each size class (or "bin") + mi_memid_t memid; // provenance of the heap struct itself (meta or os) }; @@ -479,7 +473,7 @@ typedef struct mi_stats_s { mi_stat_counter_t arena_count; mi_stat_counter_t guarded_alloc_count; #if MI_STAT>1 - mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; + mi_stat_count_t normal_bins[MI_BIN_COUNT]; #endif } mi_stats_t; @@ -513,19 +507,24 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); // ------------------------------------------------------ -// Sub processes do not reclaim or visit segments -// from other sub processes +// Sub processes use separate arena's and no heaps/pages/blocks +// are shared between sub processes. +// Each thread should also belong to one sub-process only // ------------------------------------------------------ -struct mi_subproc_s { - _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // count of abandoned pages for this sub-process - _Atomic(size_t) abandoned_os_list_count; // count of abandoned pages in the os-list - mi_lock_t abandoned_os_lock; // lock for the abandoned os pages list (outside of arena's) (this lock protect list operations) - mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list - mi_page_t* abandoned_os_list; // doubly-linked list of abandoned pages outside of arena's (in OS allocated memory) - mi_page_t* abandoned_os_list_tail; // the tail-end of the list - mi_memid_t memid; // provenance of this memory block -}; +#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) + // 160 arenas is enough for ~2 TiB memory + +typedef struct mi_subproc_s { + _Atomic(size_t) arena_count; // current count of arena's + _Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS]; // arena's of this sub-process + mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process + mi_page_queue_t os_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) + mi_lock_t os_pages_lock; // lock for the os pages list (this lock protects list operations) + mi_memid_t memid; // provenance of this memory block (meta or OS) +} mi_subproc_t; + // ------------------------------------------------------ // Thread Local data @@ -534,20 +533,21 @@ struct mi_subproc_s { // Milliseconds as in `int64_t` to avoid overflows typedef int64_t mi_msecs_t; - // Thread local data struct mi_tld_s { - unsigned long long heartbeat; // monotonic heartbeat count + mi_threadid_t thread_id; // thread id of this thread + size_t thread_seq; // thread sequence id (linear count of created threads) + mi_subproc_t* subproc; // sub-process this thread belongs to. mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - mi_subproc_t* subproc; // sub-process this thread belongs to. - size_t tseq; // thread sequence id - mi_memid_t memid; // provenance of the tld memory itself (meta or OS) + unsigned long long heartbeat; // monotonic heartbeat count bool recurse; // true if deferred was called; used to prevent infinite recursion. 
bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) mi_stats_t stats; // statistics + mi_memid_t memid; // provenance of the tld memory itself (meta or OS) }; + /* ----------------------------------------------------------- Error codes passed to `_mi_fatal_error` All are recoverable but EFAULT is a serious error and aborts by default in secure mode. diff --git a/src/alloc.c b/src/alloc.c index 25d6f62e..e5f2b8ae 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -134,7 +134,7 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, mi_assert(size <= MI_SMALL_SIZE_MAX); #if MI_DEBUG const uintptr_t tid = _mi_thread_id(); - mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local + mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == tid); // heaps are thread local #endif #if (MI_PADDING || MI_GUARDED) if (size == 0) { size = sizeof(void*); } @@ -188,7 +188,7 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z else { // regular allocation mi_assert(heap!=NULL); - mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local + mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == _mi_thread_id()); // heaps are thread local void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic mi_track_malloc(p,size,zero); diff --git a/src/arena-meta.c b/src/arena-meta.c index ceda06ba..f28c50e9 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -64,10 +64,12 @@ static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { // allocate a fresh meta page and add it to the global list. static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice + // note: we always use subproc_main directly for the meta-data since at thread start the metadata for the + // tld and heap need to be (meta) allocated and at that time we cannot read the tld pointer (yet). mi_memid_t memid; - mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, + mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc_main(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, true /* commit*/, true /* allow large */, - _mi_arena_id_none(), 0 /* tseq */, &memid ); + NULL, 0 /* tseq */, &memid ); if (mpage == NULL) return NULL; mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { diff --git a/src/arena.c b/src/arena.c index 74cd4977..bb846da9 100644 --- a/src/arena.c +++ b/src/arena.c @@ -35,7 +35,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo // A memory arena descriptor typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area - mi_arena_id_t id; // arena id (> 0 where `arena == arenas[arena->id - 1]`) + mi_subproc_t* subproc; // subprocess this arena belongs to (`this 'in' this->subproc->arenas`) size_t slice_count; // total size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) size_t info_slices; // initial slices reserved for the arena bitmaps @@ -64,64 +64,45 @@ typedef struct mi_purge_info_s { } mi_purge_info_t; -#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. 
but arena's scale up exponentially (see `mi_arena_reserve`) - // 160 arenas is enough for ~2 TiB memory - -// The available arenas -static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; -static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - - -static mi_lock_t mi_arena_reserve_lock; - -void _mi_arena_init(void) { - mi_lock_init(&mi_arena_reserve_lock); -} /* ----------------------------------------------------------- Arena id's - id = arena_index + 1 ----------------------------------------------------------- */ -size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); -} - -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; +static mi_arena_id_t mi_arena_id_create(mi_arena_t* arena) { + return arena; } mi_arena_id_t _mi_arena_id_none(void) { - return 0; + return NULL; } -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id) { + return (mi_arena_t*)id; } -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + +static bool mi_arena_id_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena) { + return ((arena == req_arena) || // they match, + (req_arena == NULL && !arena->is_exclusive)); // or the arena is not exclusive, and we didn't request a specific one +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena) { if (memid.memkind == MI_MEM_ARENA) { - const mi_arena_t* arena = memid.mem.arena.arena; - return mi_arena_id_is_suitable(arena->id, arena->is_exclusive, request_arena_id); + return mi_arena_id_is_suitable(memid.mem.arena.arena, request_arena); } else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + return mi_arena_id_is_suitable(NULL, request_arena); } } -size_t mi_arena_get_count(void) { - return mi_atomic_load_relaxed(&mi_arena_count); +size_t mi_arenas_get_count(mi_subproc_t* subproc) { + return mi_atomic_load_relaxed(&subproc->arena_count); } -mi_arena_t* mi_arena_from_index(size_t idx) { - mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]); -} - -mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { - return mi_arena_from_index(mi_arena_id_index(id)); +mi_arena_t* mi_arena_from_index(mi_subproc_t* subproc, size_t idx) { + mi_assert_internal(idx < mi_arenas_get_count(subproc)); + return mi_atomic_load_ptr_relaxed(mi_arena_t, &subproc->arenas[idx]); } static size_t mi_arena_info_slices(mi_arena_t* arena) { @@ -159,9 +140,7 @@ uint8_t* mi_arena_slice_start(mi_arena_t* arena, size_t slice_index) { // Arena area void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { if (size != NULL) *size = 0; - const size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + mi_arena_t* arena = _mi_arena_from_id(arena_id); if (arena == NULL) return NULL; if (size != NULL) { *size = mi_size_of_slices(arena->slice_count); } return mi_arena_start(arena); @@ -297,12 +276,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t 
req_arena_id, mi_arena_id_t* arena_id) +static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { // if (_mi_preloading()) return false; // use OS only while pre loading if (req_arena_id != _mi_arena_id_none()) return false; - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + const size_t arena_count = mi_arenas_get_count(subproc); if (arena_count > (MI_MAX_ARENAS - 4)) return false; // calc reserve @@ -368,32 +347,27 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re Arena iteration ----------------------------------------------------------- */ -static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_arena_id, int numa_node, bool allow_large) { +static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_large) { if (!allow_large && arena->is_large) return false; - if (!mi_arena_id_is_suitable(arena->id, arena->is_exclusive, req_arena_id)) return false; - if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity + if (!mi_arena_id_is_suitable(arena, req_arena)) return false; + if (req_arena == NULL) { // if not specific, check numa affinity const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); if (!numa_suitable) return false; } return true; } - -#define mi_forall_arenas(req_arena_id, tseq, name_arena) \ - { \ - const size_t _arena_count = mi_arena_get_count(); \ - if (_arena_count > 0) { \ - const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \ - size_t _start; \ - if (req_arena_id == _mi_arena_id_none()) { \ - /* always start searching in the arena's below the max */ \ - _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \ +#define mi_forall_arenas(subproc, req_arena, tseq, name_arena) { \ + const size_t _arena_count = mi_arenas_get_count(subproc); \ + const size_t _arena_cycle = (_arena_count == 0 ? 0 : _arena_count - 1); /* first search the arenas below the last one */ \ + /* always start searching in the arena's below the max */ \ + size_t _start = (_arena_cycle <= 1 ? 
0 : (tseq % _arena_cycle)); \ + for (size_t _i = 0; _i < _arena_count; _i++) { \ + mi_arena_t* name_arena; \ + if (req_arena != NULL) { \ + name_arena = req_arena; /* if there is a specific req_arena, only search that one */\ } \ else { \ - _start = mi_arena_id_index(req_arena_id); \ - mi_assert_internal(_start < _arena_count); \ - } \ - for (size_t _i = 0; _i < _arena_count; _i++) { \ size_t _idx; \ if (_i < _arena_cycle) { \ _idx = _i + _start; \ @@ -402,19 +376,20 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are else { \ _idx = _i; /* remaining arena's */ \ } \ - mi_arena_t* const name_arena = mi_arena_from_index(_idx); \ - if (name_arena != NULL) \ - { + name_arena = mi_arena_from_index(subproc,_idx); \ + } \ + if (name_arena != NULL) \ + { #define mi_forall_arenas_end() \ - } \ - if (req_arena_id != _mi_arena_id_none()) break; \ } \ - }} + if (req_arena != NULL) break; \ + } \ + } -#define mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, name_arena) \ - mi_forall_arenas(req_arena_id,tseq,name_arena) { \ - if (mi_arena_is_suitable(name_arena, req_arena_id, -1 /* todo: numa node */, allow_large)) { \ +#define mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, name_arena) \ + mi_forall_arenas(subproc, req_arena,tseq,name_arena) { \ + if (mi_arena_is_suitable(name_arena, req_arena, -1 /* todo: numa node */, allow_large)) { \ #define mi_forall_suitable_arenas_end() \ }} \ @@ -425,17 +400,16 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are ----------------------------------------------------------- */ // allocate slices from the arenas -static mi_decl_noinline void* mi_arena_try_find_free( - size_t slice_count, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) +static mi_decl_noinline void* mi_arenas_try_find_free( + mi_subproc_t* subproc, size_t slice_count, size_t alignment, + bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); if (alignment > MI_ARENA_SLICE_ALIGN) return NULL; // search arena's - mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) + mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena) { void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); if (p != NULL) return p; @@ -445,42 +419,43 @@ static mi_decl_noinline void* mi_arena_try_find_free( } // Allocate slices from the arena's -- potentially allocating a fresh arena -static mi_decl_noinline void* mi_arena_try_alloc( +static mi_decl_noinline void* mi_arenas_try_alloc( + mi_subproc_t* subproc, size_t slice_count, size_t alignment, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) + mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; // try to find free slices in the arena's - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); if (p != NULL) return p; // did we need a specific arena? - if (req_arena_id != _mi_arena_id_none()) return NULL; + if (req_arena != NULL) return NULL; // don't create arena's while preloading (todo: or should we?) 
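The rewritten mi_forall_arenas macro above starts each thread's search at an offset derived from its thread sequence number, so concurrent threads tend to probe different arenas first, while the most recently added arena is always visited last. A small sketch of that visiting order as a plain function (illustrative names, not the macro itself):

#include <stddef.h>
#include <stdio.h>

// visit `count` arenas: rotate the first `count-1` indices by `tseq`, keep the newest last
static void forall_arena_indices(size_t count, size_t tseq,
                                 void (*visit)(size_t idx, void* arg), void* arg) {
  if (count == 0) return;
  const size_t cycle = count - 1;                  // all but the most recently added arena
  const size_t start = (cycle <= 1 ? 0 : tseq % cycle);
  for (size_t i = 0; i < count; i++) {
    size_t idx;
    if (i < cycle) {
      idx = i + start;
      if (idx >= cycle) idx -= cycle;              // wrap around within the cycle
    }
    else {
      idx = i;                                     // the remaining (newest) arena
    }
    visit(idx, arg);
  }
}

static void print_idx(size_t idx, void* arg) { (void)arg; printf("%zu ", idx); }

int main(void) {
  forall_arena_indices(5, 7, &print_idx, NULL);    // prints: 3 0 1 2 4
  printf("\n");
  return 0;
}

Visiting the newest arena last appears intended to keep threads from piling onto a freshly reserved arena while older arenas still have free slices.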
if (_mi_preloading()) return NULL; // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) - const size_t arena_count = mi_arena_get_count(); - if (mi_lock_acquire(&mi_arena_reserve_lock)) { + const size_t arena_count = mi_arenas_get_count(subproc); + if (mi_lock_acquire(&subproc->arena_reserve_lock)) { bool ok = true; - if (arena_count == mi_arena_get_count()) { + if (arena_count == mi_arenas_get_count(subproc)) { // we are the first to enter the lock, reserve a fresh arena mi_arena_id_t arena_id = 0; - ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + ok = mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); } else { // another thread already reserved a new arena } - mi_lock_release(&mi_arena_reserve_lock); + mi_lock_release(&subproc->arena_reserve_lock); if (ok) { // try once more to allocate in the new arena - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); + mi_assert_internal(req_arena == NULL); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); if (p != NULL) return p; } } @@ -510,10 +485,10 @@ static void* mi_arena_os_alloc_aligned( // Allocate large sized memory -void* _mi_arena_alloc_aligned( +void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) + mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { mi_assert_internal(memid != NULL); mi_assert_internal(size > 0); @@ -522,24 +497,24 @@ void* _mi_arena_alloc_aligned( // const int numa_node = _mi_os_numa_node(&tld->os); // current numa node // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? - req_arena_id == _mi_arena_id_none() && // not a specific arena? + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? + req_arena == NULL && // not a specific arena? 
size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { const size_t slice_count = mi_slice_count_of_size(size); - void* p = mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); + void* p = mi_arenas_try_alloc(subproc,slice_count, alignment, commit, allow_large, req_arena, tseq, memid); if (p != NULL) return p; } // fall back to the OS - void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid); + void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena, memid); return p; } -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) +void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { - return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, tseq, memid); + return _mi_arena_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid); } @@ -548,7 +523,7 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_abandoned) { +static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, mi_heaptag_t heap_tag, bool* keep_abandoned) { // found an abandoned page of the right size mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); // can we claim ownership? @@ -560,9 +535,9 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, *keep_abandoned = true; return false; } - if (subproc != page->subproc || heap_tag != page->heap_tag) { - // wrong sub-process or heap_tag.. we need to unown again - // note: this normally never happens unless subprocesses/heaptags are actually used. + if (heap_tag != page->heap_tag) { + // wrong heap_tag.. we need to unown again + // note: this normally never happens unless heaptags are actually used. // (an unown might free the page, and depending on that we can keep it in the abandoned map or not) // note: a minor wrinkle: the page will still be mapped but the abandoned map entry is (temporarily) clear at this point. // so we cannot check in `mi_arena_free` for this invariant to hold. @@ -570,31 +545,31 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, *keep_abandoned = !freed; return false; } - // yes, we can reclaim it, keep the abandaned map entry clear + // yes, we can reclaim it, keep the abandoned map entry clear *keep_abandoned = false; return true; } -static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_heaptag_t heaptag, mi_tld_t* tld) +static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t slice_count, size_t block_size, mi_arena_t* req_arena, mi_heaptag_t heaptag, size_t tseq) { MI_UNUSED(slice_count); const size_t bin = _mi_bin(block_size); mi_assert_internal(bin < MI_BIN_COUNT); // any abandoned in our size class? 
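Reclaiming abandoned pages begins with a cheap filter: a per-size-class atomic counter is consulted before any bitmap is searched, so the common case of nothing being abandoned in a bin costs one relaxed load. A minimal sketch of that counter discipline (illustrative names; the counter is only a hint and the bitmap scan stays authoritative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define BIN_COUNT 64
static _Atomic size_t abandoned_count[BIN_COUNT];   // one counter per size class

// bookkeeping when a page of size class `bin` is abandoned or reclaimed
static void on_abandon(size_t bin) { atomic_fetch_add_explicit(&abandoned_count[bin], 1, memory_order_relaxed); }
static void on_reclaim(size_t bin) { atomic_fetch_sub_explicit(&abandoned_count[bin], 1, memory_order_relaxed); }

// stand-in for the expensive scan over the per-arena pages_abandoned bitmaps
static bool search_abandoned_bitmaps(size_t bin) { (void)bin; return false; }

static bool try_reclaim(size_t bin) {
  if (atomic_load_explicit(&abandoned_count[bin], memory_order_relaxed) == 0) {
    return false;                        // fast path: nothing abandoned in this size class
  }
  return search_abandoned_bitmaps(bin);  // slow path: scan the bitmaps of suitable arenas
}

int main(void) {
  on_abandon(3);
  (void)try_reclaim(3);   // counter says "maybe", so the scan runs
  on_reclaim(3);
  return 0;
}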
- mi_subproc_t* const subproc = tld->subproc; mi_assert_internal(subproc != NULL); - if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) return NULL; + if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) { + return NULL; + } // search arena's const bool allow_large = true; - size_t tseq = tld->tseq; - mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) + mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena) { size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; - if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, subproc, heaptag)) { + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, heaptag)) { // found an abandoned page of the right size // and claimed ownership. mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); @@ -621,8 +596,8 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl return NULL; } -static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, size_t block_alignment, - mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, + mi_arena_t* req_arena, size_t tseq) { const bool allow_large = true; const bool commit = true; @@ -636,7 +611,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { - page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, tld->tseq, &memid); + page = (mi_page_t*)mi_arenas_try_alloc(subproc, slice_count, page_alignment, commit, allow_large, req_arena, tseq, &memid); if (page != NULL) { mi_assert_internal(mi_bitmap_is_clearN(memid.mem.arena.arena->pages, memid.mem.arena.slice_index, memid.mem.arena.slice_count)); mi_bitmap_set(memid.mem.arena.arena->pages, memid.mem.arena.slice_index); @@ -648,10 +623,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz if (os_align) { // note: slice_count already includes the page mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena, &memid); } else { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena, &memid); } } @@ -724,17 +699,17 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz } static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size_t block_size) { - const mi_arena_id_t req_arena_id = heap->arena_id; + mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; // 1. 
look for an abandoned page - mi_page_t* page = mi_arena_page_try_find_abandoned(slice_count, block_size, req_arena_id, heap->tag, tld); + mi_page_t* page = mi_arena_page_try_find_abandoned(tld->subproc, slice_count, block_size, req_arena, heap->tag, tld->thread_seq); if (page != NULL) { return page; // return as abandoned } // 2. find a free block, potentially allocating a new arena - page = mi_arena_page_alloc_fresh(slice_count, block_size, 1, req_arena_id, tld); + page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -746,13 +721,13 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { - const mi_arena_id_t req_arena_id = heap->arena_id; + mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); const size_t slice_count = mi_slice_count_of_size(info_size + block_size); - mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld); + mi_page_t* page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); if (page == NULL) return NULL; mi_assert(page != NULL); @@ -836,7 +811,6 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(!mi_page_all_free(page)); mi_assert_internal(page->next==NULL); - mi_subproc_t* subproc = page->subproc; if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); @@ -851,7 +825,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_page_set_abandoned_mapped(page); const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); MI_UNUSED(wasclear); mi_assert_internal(wasclear); - mi_atomic_increment_relaxed(&subproc->abandoned_count[bin]); + mi_atomic_increment_relaxed(&arena->subproc->abandoned_count[bin]); } else { // page is full (or a singleton), page is OS/externally allocated @@ -902,7 +876,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); - mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); + mi_atomic_decrement_relaxed(&arena->subproc->abandoned_count[bin]); } else { // page is full (or a singleton), page is OS/nly allocated @@ -989,9 +963,10 @@ void _mi_arenas_collect(bool force_purge) { // Is a pointer inside any of our arenas? 
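The containment test that follows is a plain address-range check over the arenas of the current sub-process. A stand-alone sketch of the same check (illustrative toy_arena_t type; the real code walks the atomic arena array with acquire loads so the check stays lock-free against concurrent arena registration):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef struct toy_arena_s {
  uint8_t* start;   // first byte of the arena
  size_t   size;    // arena size in bytes
} toy_arena_t;

// true if `p` points into one of the `count` arenas: start <= p < start + size
bool arenas_contain(const toy_arena_t* arenas, size_t count, const void* p) {
  const uint8_t* q = (const uint8_t*)p;
  for (size_t i = 0; i < count; i++) {
    if (arenas[i].start != NULL && q >= arenas[i].start && q < arenas[i].start + arenas[i].size) {
      return true;
    }
  }
  return false;
}

int main(void) {
  static uint8_t block[4096];
  toy_arena_t arenas[1] = { { block, sizeof(block) } };
  return (arenas_contain(arenas, 1, &block[100]) ? 0 : 1);
}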
bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_arena_get_count(); + mi_subproc_t* subproc = _mi_subproc(); + const size_t max_arena = mi_arenas_get_count(subproc); for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p) { return true; } @@ -1007,14 +982,14 @@ bool _mi_arena_contains(const void* p) { // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. -static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_arena_get_count(); +static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { + const size_t max_arena = mi_arenas_get_count(subproc); size_t new_max_arena = 0; for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena != NULL) { // mi_lock_done(&arena->abandoned_visit_lock); - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL); if (mi_memkind_is_os(arena->memid.memkind)) { _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); } @@ -1023,14 +998,14 @@ static void mi_arenas_unsafe_destroy(void) { // try to lower the max arena. size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); + mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, new_max_arena); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. void _mi_arena_unsafe_destroy_all(void) { - mi_arenas_unsafe_destroy(); + mi_arenas_unsafe_destroy(_mi_subproc()); _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } @@ -1039,40 +1014,36 @@ void _mi_arena_unsafe_destroy_all(void) { Add an arena. 
----------------------------------------------------------- */ -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { +static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { mi_assert_internal(arena != NULL); mi_assert_internal(arena->slice_count > 0); - if (arena_id != NULL) { *arena_id = -1; } + if (arena_id != NULL) { *arena_id = NULL; } // first try to find a NULL entry - const size_t count = mi_arena_get_count(); + const size_t count = mi_arenas_get_count(subproc); size_t i; for (i = 0; i < count; i++) { - if (mi_arena_from_index(i) == NULL) { - arena->id = mi_arena_id_create(i); + if (mi_arena_from_index(subproc,i) == NULL) { mi_arena_t* expected = NULL; - if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &mi_arenas[i], &expected, arena)) { + if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &subproc->arenas[i], &expected, arena)) { // success - if (arena_id != NULL) { *arena_id = arena->id; } + if (arena_id != NULL) { *arena_id = arena; } return true; - } - else { - arena->id = _mi_arena_id_none(); - } + } } } // otherwise increase the max - i = mi_atomic_increment_acq_rel(&mi_arena_count); + i = mi_atomic_increment_acq_rel(&subproc->arena_count); if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); + mi_atomic_decrement_acq_rel(&subproc->arena_count); + arena->subproc = NULL; return false; } _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } + mi_atomic_store_ptr_release(mi_arena_t,&subproc->arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena; } return true; } @@ -1099,7 +1070,7 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { } -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); @@ -1138,7 +1109,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int } // init - arena->id = _mi_arena_id_none(); + arena->subproc = subproc; arena->memid = memid; arena->is_exclusive = exclusive; arena->slice_count = slice_count; @@ -1176,7 +1147,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } - return mi_arena_add(arena, arena_id, &_mi_stats_main); + return mi_arena_add(subproc, arena, arena_id, &_mi_stats_main); } @@ -1187,7 +1158,7 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is memid.initially_committed = is_committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); + return mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, numa_node, exclusive, memid, arena_id); } // Reserve a range of regular OS memory @@ -1198,7 +1169,7 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, 
&memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; @@ -1307,16 +1278,18 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi } void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { - size_t max_arenas = mi_arena_get_count(); + mi_subproc_t* subproc = _mi_subproc(); + size_t max_arenas = mi_arenas_get_count(subproc); size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; size_t page_total = 0; for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena == NULL) break; + mi_assert(arena->subproc == subproc); slice_total += arena->slice_count; - _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "", arena->subproc)); if (show_inuse) { free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } @@ -1342,7 +1315,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) ----------------------------------------------------------- */ // reserve at a specific numa node int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; + if (arena_id != NULL) *arena_id = NULL; if (pages==0) return 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); @@ -1356,7 +1329,7 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m } _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(_mi_subproc(), p, hsize, true, numa_node, exclusive, memid, arena_id)) { _mi_os_free(p, hsize, memid); return ENOMEM; } @@ -1538,10 +1511,13 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) } -static void mi_arenas_try_purge(bool force, bool visit_all) { +static void mi_arenas_try_purge(bool force, bool visit_all) +{ if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - const size_t max_arena = mi_arena_get_count(); + mi_tld_t* tld = _mi_tld(); + mi_subproc_t* subproc = tld->subproc; + const size_t max_arena = mi_arenas_get_count(subproc); if (max_arena == 0) return; // allow only one thread to purge at a time @@ -1549,12 +1525,12 @@ static void mi_arenas_try_purge(bool force, bool visit_all) { mi_atomic_guard(&purge_guard) { const mi_msecs_t now = _mi_clock_now(); - const size_t arena_start = 
_mi_tld()->tseq % max_arena; + const size_t arena_start = tld->thread_seq % max_arena; size_t max_purge_count = (visit_all ? max_arena : 1); for (size_t _i = 0; _i < max_arena; _i++) { size_t i = _i + arena_start; if (i >= max_arena) { i -= max_arena; } - mi_arena_t* arena = mi_arena_from_index(i); + mi_arena_t* arena = mi_arena_from_index(subproc,i); if (arena != NULL) { if (mi_arena_try_purge(arena, now, force)) { if (max_purge_count <= 1) break; @@ -1590,13 +1566,7 @@ static bool mi_arena_pages_reregister(mi_arena_t* arena) { } mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* full_size) { - const size_t count = mi_arena_get_count(); - const size_t arena_idx = mi_arena_id_index(arena_id); - if (count <= arena_idx) { - _mi_warning_message("arena id is invalid (%zu)\n", arena_id); - return false; - } - mi_arena_t* arena = mi_arena_from_id(arena_id); + mi_arena_t* arena = _mi_arena_from_id(arena_id); if (arena==NULL) { return false; } @@ -1627,10 +1597,17 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* _mi_page_map_unregister_range(arena, asize); // set the entry to NULL - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[arena_idx], NULL); - if (arena_idx + 1 == count) { // try adjust the count? - size_t expected = count; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, count-1); + mi_subproc_t* subproc = arena->subproc; + const size_t count = mi_arenas_get_count(subproc); + for(size_t i = 0; i < count; i++) { + if (mi_arena_from_index(subproc, i) == arena) { + mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL); + if (i + 1 == count) { // try adjust the count? + size_t expected = count; + mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, count-1); + } + break; + } } return true; } @@ -1662,8 +1639,8 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, arena->memid.initially_zero = is_zero; arena->is_exclusive = true; arena->is_large = is_large; - arena->id = _mi_arena_id_none(); - if (!mi_arena_add(arena, arena_id, &_mi_stats_main)) { + arena->subproc = NULL; + if (!mi_arena_add(_mi_subproc(), arena, arena_id, &_mi_stats_main)) { return false; } mi_arena_pages_reregister(arena); diff --git a/src/bitmap.c b/src/bitmap.c index 6fae1ed6..6352e4ea 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1228,7 +1228,6 @@ bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, typedef struct mi_claim_fun_data_s { mi_arena_t* arena; - mi_subproc_t* subproc; mi_heaptag_t heap_tag; } mi_claim_fun_data_t; @@ -1242,7 +1241,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); bool keep_set = true; - if ((*claim_fun)(slice_index, claim_data->arena, claim_data->subproc, claim_data->heap_tag, &keep_set)) { + if ((*claim_fun)(slice_index, claim_data->arena, claim_data->heap_tag, &keep_set)) { // success! mi_assert_internal(!keep_set); *pidx = slice_index; @@ -1267,9 +1266,9 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) 
mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) + mi_claim_fun_t* claim, mi_arena_t* arena, mi_heaptag_t heap_tag) { - mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; + mi_claim_fun_data_t claim_data = { arena, heap_tag }; return mi_bitmap_find(bitmap, tseq, 1, pidx, &mi_bitmap_try_find_and_claim_visit, (void*)claim, &claim_data); } diff --git a/src/bitmap.h b/src/bitmap.h index 47c22025..16ecea07 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -208,13 +208,13 @@ mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* // Called once a bit is cleared to see if the memory slice can be claimed. -typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_set); +typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_heaptag_t heap_tag, bool* keep_set); // Find a set bits in the bitmap, atomically clear it, and check if `claim` returns true. // If not claimed, continue on (potentially setting the bit again depending on `keep_set`). // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ); + mi_claim_fun_t* claim, mi_arena_t* arena, mi_heaptag_t heap_tag ); // Atomically clear a bit but only if it is set. Will block otherwise until the bit is set. diff --git a/src/free.c b/src/free.c index 14034593..770856da 100644 --- a/src/free.c +++ b/src/free.c @@ -210,7 +210,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) - _mi_arena_page_unabandon(page); + _mi_arena_page_unabandon(page); // we can free the page directly _mi_arena_page_free(page); return; @@ -234,8 +234,8 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); if ((tagheap != NULL) && // don't reclaim across heap object types (tagheap->allow_page_reclaim) && // we are allowed to reclaim abandoned pages - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + // (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) + (_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) 
) { if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use diff --git a/src/heap.c b/src/heap.c index dee404d2..e8743691 100644 --- a/src/heap.c +++ b/src/heap.c @@ -178,7 +178,7 @@ mi_heap_t* mi_heap_get_backing(void) { mi_assert_internal(heap!=NULL); mi_heap_t* bheap = heap->tld->heap_backing; mi_assert_internal(bheap!=NULL); - mi_assert_internal(bheap->thread_id == _mi_thread_id()); + mi_assert_internal(bheap->tld->thread_id == _mi_thread_id()); return bheap; } @@ -190,8 +190,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); heap->memid = memid; heap->tld = tld; // avoid reading the thread-local tld during initialization - heap->thread_id = _mi_thread_id(); - heap->arena_id = arena_id; + heap->exclusive_arena = _mi_arena_from_id(arena_id); heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); @@ -254,7 +253,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { } bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { - return _mi_arena_memid_is_suitable(memid, heap->arena_id); + return _mi_arena_memid_is_suitable(memid, heap->exclusive_arena); } uintptr_t _mi_heap_random_next(mi_heap_t* heap) { diff --git a/src/init.c b/src/init.c index 9a26d56f..a15a9c6c 100644 --- a/src/init.c +++ b/src/init.c @@ -33,8 +33,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, #endif NULL, // xheap - NULL, NULL, // next, prev - NULL, // subproc + NULL, NULL, // next, prev MI_MEMID_STATIC // memid }; @@ -96,27 +95,76 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- +static mi_decl_cache_align mi_subproc_t subproc_main; + +static mi_decl_cache_align mi_tld_t tld_empty = { + 0, // thread_id + 0, // thread_seq + &subproc_main, // subproc + NULL, // heap_backing + NULL, // heaps list + 0, // heartbeat + false, // recurse + false, // is_in_threadpool + { MI_STATS_NULL }, // stats + MI_MEMID_STATIC // memid +}; + mi_decl_cache_align const mi_heap_t _mi_heap_empty = { - NULL, - // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free - 0, // thread_id - 0, // arena_id - 0, // cookie - { 0, 0 }, // keys - { {0}, {0}, 0, true }, // random - 0, // page count - MI_BIN_FULL, 0, // page retired min/max - NULL, // next - MI_MEMID_STATIC, // memid - 0, // full page retain - false, // can reclaim - true, // can eager abandon - 0, // tag + &tld_empty, // tld + NULL, // exclusive_arena + 0, // cookie + { 0, 0 }, // keys + { {0}, {0}, 0, true }, // random + 0, // page count + MI_BIN_FULL, 0, // page retired min/max + NULL, // next + 0, // full page retain + false, // can reclaim + true, // can eager abandon + 0, // tag #if MI_GUARDED - 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) + 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) #endif MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY + MI_PAGE_QUEUES_EMPTY, + MI_MEMID_STATIC +}; + +extern mi_heap_t heap_main; + +static mi_decl_cache_align mi_tld_t tld_main = { + 0, // thread_id + 0, // thread_seq + &subproc_main, // subproc + &heap_main, // heap_backing + &heap_main, // heaps list + 0, // heartbeat + false, // recurse + false, // is_in_threadpool + { MI_STATS_NULL 
}, // stats + MI_MEMID_STATIC // memid +}; + +mi_decl_cache_align mi_heap_t heap_main = { + &tld_main, // thread local data + 0, // initial cookie + 0, // arena id + { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) + { {0x846ca68b}, {0}, 0, true }, // random + 0, // page count + MI_BIN_FULL, 0, // page retired min/max + NULL, // next heap + 2, // full page retain + true, // allow page reclaim + true, // allow page abandon + 0, // tag + #if MI_GUARDED + 0, 0, 0, 0, 0, + #endif + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY, + MI_MEMID_STATIC }; @@ -124,49 +172,9 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } - // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; -extern mi_heap_t _mi_heap_main; - -static mi_decl_cache_align mi_subproc_t mi_subproc_default; - -static mi_decl_cache_align mi_tld_t tld_main = { - 0, - &_mi_heap_main, // heap_backing - &_mi_heap_main, // heaps list - &mi_subproc_default, // subproc - 0, // tseq - MI_MEMID_STATIC, // memid - false, // recurse - false, // is_in_threadpool - { MI_STATS_NULL } // stats -}; - -mi_decl_cache_align mi_heap_t _mi_heap_main = { - &tld_main, - // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free list - 0, // thread id - 0, // initial cookie - 0, // arena id - { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) - { {0x846ca68b}, {0}, 0, true }, // random - 0, // page count - MI_BIN_FULL, 0, // page retired min/max - NULL, // next heap - MI_MEMID_STATIC, // memid - 2, // full page retain - true, // allow page reclaim - true, // allow page abandon - 0, // tag - #if MI_GUARDED - 0, 0, 0, 0, 0, - #endif - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY -}; - bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. mi_stats_t _mi_stats_main = { MI_STATS_NULL }; @@ -210,30 +218,46 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { } #endif - -static void mi_heap_main_init(void) { - if (_mi_heap_main.cookie == 0) { - _mi_heap_main.thread_id = _mi_thread_id(); - _mi_heap_main.cookie = 1; - #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) - _mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking - #else - _mi_random_init(&_mi_heap_main.random); - #endif - _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); - mi_lock_init(&mi_subproc_default.abandoned_os_lock); - mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock); - _mi_heap_guarded_init(&_mi_heap_main); - _mi_heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); - _mi_heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); +// Initialize main subproc +static void mi_subproc_main_init(void) { + if (subproc_main.memid.memkind != MI_MEM_STATIC) { + subproc_main.memid = _mi_memid_create(MI_MEM_STATIC); + mi_lock_init(&subproc_main.os_pages_lock); + mi_lock_init(&subproc_main.arena_reserve_lock); } } -mi_heap_t* _mi_heap_main_get(void) { +// Initialize main tld +static void mi_tld_main_init(void) { + if (tld_main.thread_id == 0) { + tld_main.thread_id = _mi_prim_thread_id(); + } +} + +// Initialization of the (statically allocated) main heap, and the main tld and subproc. 
+static void mi_heap_main_init(void) { + if (heap_main.cookie == 0) { + mi_subproc_main_init(); + mi_tld_main_init(); + // heap + heap_main.cookie = 1; + #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) + _mi_random_init_weak(&heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking + #else + _mi_random_init(&heap_main.random); + #endif + heap_main.cookie = _mi_heap_random_next(&heap_main); + heap_main.keys[0] = _mi_heap_random_next(&heap_main); + heap_main.keys[1] = _mi_heap_random_next(&heap_main); + _mi_heap_guarded_init(&heap_main); + heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); + heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); + } +} + +mi_heap_t* heap_main_get(void) { mi_heap_main_init(); - return &_mi_heap_main; + return &heap_main; } @@ -265,8 +289,9 @@ static mi_tld_t* mi_tld_alloc(void) { tld->memid = memid; tld->heap_backing = NULL; tld->heaps = NULL; - tld->subproc = &mi_subproc_default; - tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->subproc = &subproc_main; + tld->thread_id = _mi_prim_thread_id(); + tld->thread_seq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); return tld; } @@ -291,12 +316,24 @@ mi_decl_noinline mi_tld_t* _mi_tld(void) { return mi_tld; } +mi_subproc_t* _mi_subproc(void) { + if (_mi_is_main_thread()) { // during initialization we should not recurse over reading the _mi_tld + return &subproc_main; + } + else { + return _mi_tld()->subproc; + } +} /* ----------------------------------------------------------- Sub process ----------------------------------------------------------- */ +mi_subproc_t* _mi_subproc_main(void) { + return &subproc_main; +} + mi_subproc_id_t mi_subproc_main(void) { return NULL; } @@ -305,42 +342,41 @@ mi_subproc_id_t mi_subproc_new(void) { mi_memid_t memid; mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid); if (subproc == NULL) return NULL; - subproc->abandoned_os_list = NULL; subproc->memid = memid; - mi_lock_init(&subproc->abandoned_os_lock); - mi_lock_init(&subproc->abandoned_os_visit_lock); + mi_lock_init(&subproc->os_pages_lock); + mi_lock_init(&subproc->arena_reserve_lock); return subproc; } mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { - return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id); + return (subproc_id == NULL ? &subproc_main : (mi_subproc_t*)subproc_id); } void mi_subproc_delete(mi_subproc_id_t subproc_id) { if (subproc_id == NULL) return; mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); - // check if there are no abandoned segments still.. + // check if there are os pages still.. bool safe_to_delete = false; - if (mi_lock_acquire(&subproc->abandoned_os_lock)) { - if (subproc->abandoned_os_list == NULL) { + if (mi_lock_acquire(&subproc->os_pages_lock)) { + if (subproc->os_pages.first == NULL) { safe_to_delete = true; } - mi_lock_release(&subproc->abandoned_os_lock); + mi_lock_release(&subproc->os_pages_lock); } if (!safe_to_delete) return; // safe to release // todo: should we refcount subprocesses? 
- mi_lock_done(&subproc->abandoned_os_lock); - mi_lock_done(&subproc->abandoned_os_visit_lock); + mi_lock_done(&subproc->os_pages_lock); + mi_lock_done(&subproc->arena_reserve_lock); _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { - mi_heap_t* heap = mi_heap_get_default(); - if (heap == NULL) return; - mi_assert(heap->tld->subproc == &mi_subproc_default); - if (heap->tld->subproc != &mi_subproc_default) return; - heap->tld->subproc = _mi_subproc_from_id(subproc_id); + mi_tld_t* tld = _mi_tld(); + if (tld == NULL) return; + mi_assert(tld->subproc == &subproc_main); + if (tld->subproc != &subproc_main) return; + tld->subproc = _mi_subproc_from_id(subproc_id); } @@ -352,10 +388,10 @@ void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { static bool _mi_thread_heap_init(void) { if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; if (_mi_is_main_thread()) { - // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization + // mi_assert_internal(heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization // the main heap is statically allocated mi_heap_main_init(); - _mi_heap_set_default_direct(&_mi_heap_main); + _mi_heap_set_default_direct(&heap_main); //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { @@ -383,7 +419,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { if (!mi_heap_is_initialized(heap)) return true; // reset default heap - _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty); + _mi_heap_set_default_direct(_mi_is_main_thread() ? &heap_main : (mi_heap_t*)&_mi_heap_empty); // switch to backing heap heap = heap->tld->heap_backing; @@ -403,7 +439,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { mi_assert_internal(mi_heap_is_backing(heap)); // collect if not the main thread - if (heap != &_mi_heap_main) { + if (heap != &heap_main) { _mi_heap_collect_abandon(heap); } @@ -413,12 +449,12 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // free heap meta data _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); - if (heap == &_mi_heap_main) { + if (heap == &heap_main) { #if 0 // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, // there may still be delete/free calls after the mi_fls_done is called. Issue #207 _mi_heap_destroy_pages(heap); - mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); + mi_assert_internal(heap->tld->heap_backing == &heap_main); #endif } @@ -449,12 +485,12 @@ static void mi_process_setup_auto_thread_done(void) { if (tls_initialized) return; tls_initialized = true; _mi_prim_thread_init_auto_done(); - _mi_heap_set_default_direct(&_mi_heap_main); + _mi_heap_set_default_direct(&heap_main); } bool _mi_is_main_thread(void) { - return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id()); + return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id()); } static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); @@ -501,7 +537,7 @@ void _mi_thread_done(mi_heap_t* heap) _mi_stat_decrease(&_mi_stats_main.threads, 1); // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... 
- if (heap->thread_id != _mi_thread_id()) return; + if (heap->tld->thread_id != _mi_prim_thread_id()) return; // abandon the thread local heap _mi_thread_heap_done(heap); // returns true if already ran @@ -560,7 +596,7 @@ void _mi_process_load(void) { } // reseed random - _mi_random_reinit_if_weak(&_mi_heap_main.random); + _mi_random_reinit_if_weak(&heap_main.random); } #if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) @@ -587,7 +623,7 @@ void mi_process_init(void) mi_attr_noexcept { // ensure we are called once static mi_atomic_once_t process_init; #if _MSC_VER < 1920 - mi_heap_main_init(); // vs2017 can dynamically re-initialize _mi_heap_main + mi_heap_main_init(); // vs2017 can dynamically re-initialize heap_main #endif if (!mi_atomic_once(&process_init)) return; _mi_process_is_initialized = true; @@ -595,10 +631,11 @@ void mi_process_init(void) mi_attr_noexcept { mi_process_setup_auto_thread_done(); mi_detect_cpu_features(); + mi_subproc_main_init(); + mi_tld_main_init(); + mi_heap_main_init(); _mi_os_init(); _mi_page_map_init(); - _mi_arena_init(); - mi_heap_main_init(); #if MI_DEBUG _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif @@ -609,7 +646,7 @@ void mi_process_init(void) mi_attr_noexcept { #endif mi_thread_init(); - #if defined(_WIN32) + #if defined(_WIN32) && defined(MI_WIN_USE_FLS) // On windows, when building as a static lib the FLS cleanup happens to early for the main thread. // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup // will not call _mi_thread_done on the (still executing) main thread. See issue #508. @@ -670,7 +707,7 @@ void mi_cdecl _mi_process_done(void) { mi_stats_print(NULL); } _mi_allocator_done(); - _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); + _mi_verbose_message("process done: 0x%zx\n", tld_main.thread_id); os_preloading = true; // don't call the C runtime anymore } diff --git a/src/page.c b/src/page.c index d97537d1..0444b47e 100644 --- a/src/page.c +++ b/src/page.c @@ -591,7 +591,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); mi_page_set_heap(page, heap); - page->subproc = heap->tld->subproc; + size_t page_size; uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start); mi_track_mem_noaccess(page_start,page_size); From daac75af3611710b9631434a25fbe9f30fd11414 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 22:13:58 -0800 Subject: [PATCH 113/264] fix lock recursion --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 +- include/mimalloc/atomic.h | 27 +++++++++++-- src/arena.c | 15 ++++++-- src/init.c | 51 +++++++++++++------------ 4 files changed, 62 insertions(+), 35 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index fd88cd8e..672cbb87 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea6} + + {abb5eae7-b3e6-432e-b636-333449892ea7} diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index ddb5a9a3..ab1e161d 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -408,9 +408,8 @@ static inline void mi_atomic_yield(void) { // ---------------------------------------------------------------------- // Locks -// These do not have to be recursive and should be light-weight -// in-process only locks. 
Only used for reserving arena's and to -// maintain the abandoned list. +// These should be light-weight in-process only locks. +// Only used for reserving arena's and to maintain the abandoned list. // ---------------------------------------------------------------------- #if _MSC_VER #pragma warning(disable:26110) // unlock with holding lock @@ -418,6 +417,26 @@ static inline void mi_atomic_yield(void) { #if defined(_WIN32) +#define mi_lock_t CRITICAL_SECTION + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} +static inline bool mi_lock_acquire(mi_lock_t* lock) { + EnterCriticalSection(lock); + return true; +} +static inline void mi_lock_release(mi_lock_t* lock) { + LeaveCriticalSection(lock); +} +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeCriticalSection(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + DeleteCriticalSection(lock); +} + +#if 0 #define mi_lock_t SRWLOCK // slim reader-writer lock static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -436,7 +455,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } - +#endif #elif defined(MI_USE_PTHREADS) diff --git a/src/arena.c b/src/arena.c index bb846da9..fd914f43 100644 --- a/src/arena.c +++ b/src/arena.c @@ -275,6 +275,8 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } +static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id); + // try to reserve a fresh arena space static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { @@ -325,7 +327,7 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ const bool adjust = (overcommit && arena_commit); if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true /* on alloc */); } // and try to reserve the arena - int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { if (adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back // failed, try a smaller size? @@ -1162,14 +1164,14 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is } // Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { +static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one slice mi_memid_t memid; void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
- if (!mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(subproc, start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; @@ -1180,6 +1182,11 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc return 0; } +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + return mi_reserve_os_memory_ex2(_mi_subproc(), size, commit, allow_large, exclusive, arena_id); +} + // Manage a range of regular OS memory bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); @@ -1289,7 +1296,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) if (arena == NULL) break; mi_assert(arena->subproc == subproc); slice_total += arena->slice_count; - _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "", arena->subproc)); + _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : ""), arena->subproc); if (show_inuse) { free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } diff --git a/src/init.c b/src/init.c index a15a9c6c..177ca2bd 100644 --- a/src/init.c +++ b/src/init.c @@ -11,30 +11,31 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // memcpy, memset #include // atexit -#define MI_MEMID_STATIC {{{NULL,0}}, MI_MEM_STATIC, true /* pinned */, true /* committed */, false /* zero */ } +#define MI_MEMID_INIT(kind) {{{NULL,0}}, kind, true /* pinned */, true /* committed */, false /* zero */ } +#define MI_MEMID_STATIC MI_MEMID_INIT(MI_MEM_STATIC) // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - MI_ATOMIC_VAR_INIT(0), // xthread_id - NULL, // free - 0, // used - 0, // capacity - 0, // reserved capacity - 0, // block size shift - 0, // retire_expire - NULL, // local_free - MI_ATOMIC_VAR_INIT(0), // xthread_free - MI_ATOMIC_VAR_INIT(0), // xflags - 0, // block_size - NULL, // page_start - 0, // heap tag - false, // is_zero + MI_ATOMIC_VAR_INIT(0), // xthread_id + NULL, // free + 0, // used + 0, // capacity + 0, // reserved capacity + 0, // block size shift + 0, // retire_expire + NULL, // local_free + MI_ATOMIC_VAR_INIT(0), // xthread_free + MI_ATOMIC_VAR_INIT(0), // xflags + 0, // block_size + NULL, // page_start + 0, // heap tag + false, // is_zero #if (MI_PADDING || MI_ENCODE_FREELIST) - { 0, 0 }, + { 0, 0 }, // keys #endif - NULL, // xheap - NULL, NULL, // next, prev - MI_MEMID_STATIC // memid + NULL, // xheap + NULL, NULL, // next, prev + MI_MEMID_STATIC // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -100,7 +101,7 @@ static mi_decl_cache_align mi_subproc_t subproc_main; static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id 0, // thread_seq - &subproc_main, // subproc + &subproc_main, // subproc NULL, // heap_backing NULL, // heaps list 0, // heartbeat @@ -111,7 +112,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = { }; mi_decl_cache_align const mi_heap_t _mi_heap_empty = { - &tld_empty, // tld + &tld_empty, // tld NULL, // exclusive_arena 0, // cookie { 0, 0 }, // keys @@ -136,9 +137,9 @@ extern mi_heap_t heap_main; static mi_decl_cache_align mi_tld_t tld_main = { 0, // thread_id 0, // thread_seq - &subproc_main, // subproc - &heap_main, // heap_backing - &heap_main, // heaps list + &subproc_main, // subproc + &heap_main, // heap_backing + &heap_main, // heaps list 0, // heartbeat false, // recurse false, // is_in_threadpool @@ -147,7 +148,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { }; mi_decl_cache_align mi_heap_t heap_main = { - &tld_main, // thread local data + &tld_main, // thread local data 0, // initial cookie 0, // arena id { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) 
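The Windows lock in the hunk above now maps to a CRITICAL_SECTION, which the owning thread may enter recursively, while the previous SRWLOCK definitions (for which recursive acquisition is not allowed) are parked behind #if 0; the non-Windows definitions appear untouched in this patch. Every caller goes through the same small wrapper API: mi_lock_init, mi_lock_try_acquire, mi_lock_acquire, mi_lock_release and mi_lock_done. Below is a minimal sketch of the acquire/inspect/release pattern used for the sub-process locks, modeled on mi_subproc_delete earlier in the series; the helper name is illustrative only and assumes the internal mimalloc headers:

  // illustrative helper (not in the patch): inspect the protected os_pages list of a sub-process
  static bool subproc_os_pages_empty(mi_subproc_t* subproc) {
    bool empty = false;
    if (mi_lock_acquire(&subproc->os_pages_lock)) {   // EnterCriticalSection; wrapper returns true once held
      empty = (subproc->os_pages.first == NULL);      // read the list head while holding the lock
      mi_lock_release(&subproc->os_pages_lock);       // release on the same thread
    }
    return empty;
  }

Because a critical section is re-entrant for its owner, a nested acquire on the same thread (which appears to be the recursion the subject line refers to) no longer deadlocks.
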
From dece8a587b5cb8642c28e0aa40c850da9c30ceb4 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 10:43:08 -0800 Subject: [PATCH 114/264] make stats part of a subproc --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 +- include/mimalloc/atomic.h | 6 +- include/mimalloc/internal.h | 1 - include/mimalloc/types.h | 126 ++++++++++++++-------- src/alloc-aligned.c | 4 +- src/arena.c | 51 +++++---- src/bitmap.c | 4 +- src/free.c | 2 +- src/heap.c | 20 ++-- src/init.c | 89 +++++++++------- src/os.c | 30 +++--- src/page.c | 12 +-- src/stats.c | 136 +++++++++++++----------- test/test-stress.c | 8 +- 14 files changed, 274 insertions(+), 219 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index 672cbb87..fd88cd8e 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea7} + + {abb5eae7-b3e6-432e-b636-333449892ea6} diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index ab1e161d..0c7fafe3 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -417,6 +417,8 @@ static inline void mi_atomic_yield(void) { #if defined(_WIN32) +#if 0 + #define mi_lock_t CRITICAL_SECTION static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -436,7 +438,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { DeleteCriticalSection(lock); } -#if 0 +#else + #define mi_lock_t SRWLOCK // slim reader-writer lock static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -455,6 +458,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } + #endif #elif defined(MI_USE_PTHREADS) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 24792f8c..7774b378 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -90,7 +90,6 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c -extern mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_cache_align const mi_page_t _mi_page_empty; void _mi_process_load(void); void mi_cdecl _mi_process_done(void); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 4d43e887..ca3913ad 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -293,7 +293,7 @@ typedef struct mi_page_s { uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary #endif - mi_heap_t* heap; // heap this threads belong to. + mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` mi_memid_t memid; // provenance of the page memory @@ -394,7 +394,7 @@ typedef struct mi_padding_s { // A heap owns a set of pages. 
struct mi_heap_s { mi_tld_t* tld; // thread-local data - mi_arena_t* exclusive_arena; // if the heap belongs to a specific arena (or NULL) + mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -444,18 +444,18 @@ typedef struct mi_stat_counter_s { } mi_stat_counter_t; typedef struct mi_stats_s { - mi_stat_count_t pages; - mi_stat_count_t reserved; - mi_stat_count_t committed; - mi_stat_count_t reset; - mi_stat_count_t purged; - mi_stat_count_t page_committed; - mi_stat_count_t pages_abandoned; - mi_stat_count_t threads; - mi_stat_count_t normal; - mi_stat_count_t huge; - mi_stat_count_t giant; - mi_stat_count_t malloc; + mi_stat_count_t pages; + mi_stat_count_t reserved; + mi_stat_count_t committed; + mi_stat_count_t reset; + mi_stat_count_t purged; + mi_stat_count_t page_committed; + mi_stat_count_t pages_abandoned; + mi_stat_count_t threads; + mi_stat_count_t normal; + mi_stat_count_t huge; + mi_stat_count_t giant; + mi_stat_count_t malloc; mi_stat_counter_t pages_extended; mi_stat_counter_t pages_reclaim_on_alloc; mi_stat_counter_t pages_reclaim_on_free; @@ -479,37 +479,72 @@ typedef struct mi_stats_s { // add to stat keeping track of the peak -void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); -void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +void __mi_stat_increase(mi_stat_count_t* stat, size_t amount); +void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount); +void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount); // adjust stat in special cases to compensate for double counting -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc); -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free); +void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc); +void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free); +void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc); +void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_free); // counters can just be increased -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); +void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); +void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount); #if (MI_STAT) -#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) -#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) -#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount) -#define mi_stat_adjust_increase(stat,amnt,b) _mi_stat_adjust_increase( &(stat), amnt, b) -#define mi_stat_adjust_decrease(stat,amnt,b) _mi_stat_adjust_decrease( &(stat), amnt, b) +#define mi_debug_stat_increase(stat,amount) __mi_stat_increase( &(stat), amount) +#define mi_debug_stat_decrease(stat,amount) __mi_stat_decrease( &(stat), amount) +#define mi_debug_stat_counter_increase(stat,amount) __mi_stat_counter_increase( &(stat), amount) +#define mi_debug_stat_increase_mt(stat,amount) __mi_stat_increase_mt( &(stat), amount) +#define mi_debug_stat_decrease_mt(stat,amount) __mi_stat_decrease_mt( &(stat), amount) 
+#define mi_debug_stat_counter_increase_mt(stat,amount) __mi_stat_counter_increase_mt( &(stat), amount) +#define mi_debug_stat_adjust_increase_mt(stat,amnt,b) __mi_stat_adjust_increase_mt( &(stat), amnt, b) +#define mi_debug_stat_adjust_decrease_mt(stat,amnt,b) __mi_stat_adjust_decrease_mt( &(stat), amnt, b) #else -#define mi_stat_increase(stat,amount) ((void)0) -#define mi_stat_decrease(stat,amount) ((void)0) -#define mi_stat_counter_increase(stat,amount) ((void)0) -#define mi_stat_adjuct_increase(stat,amnt,b) ((void)0) -#define mi_stat_adjust_decrease(stat,amnt,b) ((void)0) +#define mi_debug_stat_increase(stat,amount) ((void)0) +#define mi_debug_stat_decrease(stat,amount) ((void)0) +#define mi_debug_stat_counter_increase(stat,amount) ((void)0) +#define mi_debug_stat_increase_mt(stat,amount) ((void)0) +#define mi_debug_stat_decrease_mt(stat,amount) ((void)0) +#define mi_debug_stat_counter_increase_mt(stat,amount) ((void)0) +#define mi_debug_stat_adjust_increase(stat,amnt,b) ((void)0) +#define mi_debug_stat_adjust_decrease(stat,amnt,b) ((void)0) #endif -#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount) -#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) -#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) +#define mi_subproc_stat_counter_increase(subproc,stat,amount) __mi_stat_counter_increase_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_increase(subproc,stat,amount) __mi_stat_increase_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_decrease(subproc,stat,amount) __mi_stat_decrease_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_adjust_increase(subproc,stat,amnt,b) __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amnt, b) +#define mi_subproc_stat_adjust_decrease(subproc,stat,amnt,b) __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amnt, b) + +#define mi_os_stat_counter_increase(stat,amount) mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount) +#define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount) +#define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount) + +#define mi_tld_stat_counter_increase(tld,stat,amount) __mi_stat_counter_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_increase(tld,stat,amount) __mi_stat_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_decrease(tld,stat,amount) __mi_stat_decrease( &(tld)->stats.stat, amount) + +#define mi_debug_tld_stat_counter_increase(tld,stat,amount) mi_debug_stat_counter_increase( (tld)->stats.stat, amount) +#define mi_debug_tld_stat_increase(tld,stat,amount) mi_debug_stat_increase( (tld)->stats.stat, amount) +#define mi_debug_tld_stat_decrease(tld,stat,amount) mi_debug_stat_decrease( (tld)->stats.stat, amount) + +#define mi_heap_stat_counter_increase(heap,stat,amount) mi_tld_stat_counter_increase((heap)->tld, stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) mi_tld_stat_increase( (heap)->tld, stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) mi_tld_stat_decrease( (heap)->tld, stat, amount) + +#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_tld_stat_counter_increase((heap)->tld, stat, amount) +#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_tld_stat_increase( (heap)->tld, stat, amount) +#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_tld_stat_decrease( 
(heap)->tld, stat, amount) // ------------------------------------------------------ // Sub processes use separate arena's and no heaps/pages/blocks // are shared between sub processes. -// Each thread should also belong to one sub-process only +// The subprocess structure contains essentially all static variables (except per subprocess :-)) +// +// Each thread should belong to one sub-process only // ------------------------------------------------------ #define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) @@ -519,10 +554,13 @@ typedef struct mi_subproc_s { _Atomic(size_t) arena_count; // current count of arena's _Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS]; // arena's of this sub-process mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time - _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process + + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process mi_page_queue_t os_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) mi_lock_t os_pages_lock; // lock for the os pages list (this lock protects list operations) + mi_memid_t memid; // provenance of this memory block (meta or OS) + mi_stats_t stats; // sub-process statistics (tld stats are merged in on thread termination) } mi_subproc_t; @@ -535,16 +573,16 @@ typedef int64_t mi_msecs_t; // Thread local data struct mi_tld_s { - mi_threadid_t thread_id; // thread id of this thread - size_t thread_seq; // thread sequence id (linear count of created threads) - mi_subproc_t* subproc; // sub-process this thread belongs to. - mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) - mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - unsigned long long heartbeat; // monotonic heartbeat count - bool recurse; // true if deferred was called; used to prevent infinite recursion. - bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) - mi_stats_t stats; // statistics - mi_memid_t memid; // provenance of the tld memory itself (meta or OS) + mi_threadid_t thread_id; // thread id of this thread + size_t thread_seq; // thread sequence id (linear count of created threads) + mi_subproc_t* subproc; // sub-process this thread belongs to. + mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) + mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) + unsigned long long heartbeat; // monotonic heartbeat count + bool recurse; // true if deferred was called; used to prevent infinite recursion. + bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) + mi_stats_t stats; // statistics + mi_memid_t memid; // provenance of the tld memory itself (meta or OS) }; diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 14cbee45..5da9fc0c 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -193,9 +193,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; if mi_likely(is_aligned) { - #if MI_STAT>1 - mi_heap_stat_increase(heap, malloc, size); - #endif + mi_debug_heap_stat_increase(heap, malloc, size); void* p = (zero ? 
_mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen mi_assert_internal(p != NULL); mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); diff --git a/src/arena.c b/src/arena.c index fd914f43..dcff8920 100644 --- a/src/arena.c +++ b/src/arena.c @@ -69,10 +69,6 @@ typedef struct mi_purge_info_s { Arena id's ----------------------------------------------------------- */ -static mi_arena_id_t mi_arena_id_create(mi_arena_t* arena) { - return arena; -} - mi_arena_id_t _mi_arena_id_none(void) { return NULL; } @@ -222,14 +218,14 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); // adjust the stats so we don't double count the commits if (already_committed_count > 0) { - _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count), true /* on alloc */); + mi_subproc_stat_adjust_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count), true /* on alloc */); } // now actually commit bool commit_zero = false; if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) { // failed to commit (todo: give warning?) if (already_committed_count > 0) { - _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + mi_subproc_stat_increase(arena->subproc, committed, mi_size_of_slices(already_committed_count)); } memid->initially_committed = false; } @@ -251,7 +247,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // if the OS has overcommit, and this is the first time we access these pages, then // count the commit now (as at arena reserve we didn't count those commits as these are on-demand) if (_mi_os_has_overcommit() && touched_slices > 0) { - _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(touched_slices)); + mi_subproc_stat_increase( arena->subproc, committed, mi_size_of_slices(touched_slices)); } } // tool support @@ -325,18 +321,18 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice // is actually allocated for the first time it will be counted. const bool adjust = (overcommit && arena_commit); - if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true /* on alloc */); } + if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true /* on alloc */); } // and try to reserve the arena int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { - if (adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back + if (adjust) { mi_subproc_stat_adjust_increase( subproc, committed, arena_reserve, true); } // roll back // failed, try a smaller size? const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); - if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true); } + if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true); } if (arena_reserve > small_arena_reserve) { // try again err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? 
*/, arena_id); - if (err != 0 && adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back + if (err != 0 && adjust) { mi_subproc_stat_adjust_increase( subproc, committed, arena_reserve, true); } // roll back } } return (err==0); @@ -579,8 +575,8 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_arena_has_page(arena,page)); mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); + mi_subproc_stat_decrease( arena->subproc, pages_abandoned, 1); + mi_subproc_stat_counter_increase(arena->subproc, pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); @@ -828,12 +824,13 @@ void _mi_arena_page_abandon(mi_page_t* page) { const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); MI_UNUSED(wasclear); mi_assert_internal(wasclear); mi_atomic_increment_relaxed(&arena->subproc->abandoned_count[bin]); + mi_subproc_stat_increase(arena->subproc, pages_abandoned, 1); } else { // page is full (or a singleton), page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page - } - _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1); + mi_subproc_stat_increase(_mi_subproc(), pages_abandoned, 1); + } _mi_page_unown(page); } @@ -850,8 +847,9 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { return false; } else { - _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1); - _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh + mi_subproc_t* subproc = _mi_subproc(); + mi_subproc_stat_counter_increase( subproc, pages_reabandon_full, 1); + mi_subproc_stat_adjust_decrease( subproc, pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh _mi_arena_page_abandon(page); return true; } @@ -879,13 +877,14 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&arena->subproc->abandoned_count[bin]); + mi_subproc_stat_decrease(arena->subproc, pages_abandoned, 1); } else { - // page is full (or a singleton), page is OS/nly allocated + // page is full (or a singleton), page is OS allocated // nothing to do // TODO: maintain count of these as well? - } - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); + mi_subproc_stat_decrease(_mi_subproc(), pages_abandoned, 1); + } } void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { @@ -1016,7 +1015,7 @@ void _mi_arena_unsafe_destroy_all(void) { Add an arena. 
----------------------------------------------------------- */ -static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { +static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id) { mi_assert_internal(arena != NULL); mi_assert_internal(arena->slice_count > 0); if (arena_id != NULL) { *arena_id = NULL; } @@ -1043,7 +1042,7 @@ static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t return false; } - _mi_stat_counter_increase(&stats->arena_count,1); + mi_subproc_stat_counter_increase(arena->subproc, arena_count, 1); mi_atomic_store_ptr_release(mi_arena_t,&subproc->arenas[i], arena); if (arena_id != NULL) { *arena_id = arena; } return true; @@ -1149,7 +1148,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } - return mi_arena_add(subproc, arena, arena_id, &_mi_stats_main); + return mi_arena_add(subproc, arena, arena_id); } @@ -1414,7 +1413,7 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c // update committed bitmap if (needs_recommit) { - _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); + mi_subproc_stat_adjust_decrease( arena->subproc, committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } return needs_recommit; @@ -1506,7 +1505,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire_base, (mi_msecs_t)0)) { mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); // and also reset the extend } - _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); + mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1); // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) // this also clears those ranges atomically (so any newly freed blocks will get purged next @@ -1647,7 +1646,7 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, arena->is_exclusive = true; arena->is_large = is_large; arena->subproc = NULL; - if (!mi_arena_add(_mi_subproc(), arena, arena_id, &_mi_stats_main)) { + if (!mi_arena_add(_mi_subproc(), arena, arena_id)) { return false; } mi_arena_pages_reregister(arena); diff --git a/src/bitmap.c b/src/bitmap.c index 6352e4ea..e4a4cc2d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -106,7 +106,9 @@ static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_ do { if mi_unlikely((old&mask) == 0) { old = mi_atomic_load_acquire(b); - if ((old&mask)==0) { _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); } + if ((old&mask)==0) { + mi_subproc_stat_counter_increase(_mi_subproc(), pages_unabandon_busy_wait, 1); + } while ((old&mask)==0) { // busy wait mi_atomic_yield(); old = mi_atomic_load_acquire(b); diff --git a/src/free.c b/src/free.c index 770856da..88f784c7 100644 --- a/src/free.c +++ b/src/free.c @@ -242,7 +242,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { // first remove it from the abandoned pages in the arena -- this waits for any readers to finish _mi_arena_page_unabandon(page); _mi_heap_page_reclaim(tagheap, page); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); + mi_heap_stat_counter_increase(tagheap, 
pages_reclaim_on_free, 1); return; } } diff --git a/src/heap.c b/src/heap.c index e8743691..d82b383f 100644 --- a/src/heap.c +++ b/src/heap.c @@ -141,7 +141,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - + // collect arenas (this is program wide so don't force purges on abandonment of threads) _mi_arenas_collect(collect == MI_FORCE /* force purge? */); } @@ -183,9 +183,9 @@ mi_heap_t* mi_heap_get_backing(void) { } // todo: make order of parameters consistent (but would that break compat with CPython?) -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) { - mi_assert_internal(heap!=NULL); + mi_assert_internal(heap!=NULL); mi_memid_t memid = heap->memid; _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); heap->memid = memid; @@ -204,7 +204,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->full_page_retain = heap->full_page_retain / 4; } } - + if (heap->tld->heap_backing == NULL) { heap->tld->heap_backing = heap; // first heap becomes the backing heap _mi_random_init(&heap->random); @@ -240,7 +240,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) { mi_heap_t* bheap = mi_heap_get_backing(); mi_assert_internal(bheap != NULL); - return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld); + return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld); } mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { @@ -333,17 +333,17 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ if (bsize > MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, huge, bsize); } -#if (MI_STAT) + #if (MI_STAT) _mi_page_free_collect(page, false); // update used count const size_t inuse = page->used; if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, normal, bsize * inuse); -#if (MI_STAT>1) + #if (MI_STAT>1) mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse); -#endif + #endif } mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks... 
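Note on the statistics changes above: the direct updates of the global `_mi_stats_main` are replaced by `mi_heap_stat_*`, `mi_subproc_stat_*`, and `mi_os_stat_*` macros that select a named counter in either the thread-local tld stats or the shared sub-process stats (parts of that macro layer are visible in the `types.h` and `stats.c` hunks further below). A simplified standalone sketch of the pattern, with illustrative names rather than mimalloc's actual definitions:

#include <stdatomic.h>
#include <stdint.h>

// illustrative stat sets; mimalloc keeps one mi_stats_t type for both cases
typedef struct thread_stats_s { int64_t searches; int64_t pages_reclaim_on_free; } thread_stats_t;
typedef struct shared_stats_s { _Atomic(int64_t) arena_count; _Atomic(int64_t) threads; } shared_stats_t;

typedef struct heap_s    { thread_stats_t stats; } heap_t;     // owned by a single thread: plain adds
typedef struct subproc_s { shared_stats_t stats; } subproc_t;  // shared by all threads: atomic adds

// the macros pick the counter by name, in the spirit of
// mi_heap_stat_counter_increase(heap,stat,n) and mi_subproc_stat_counter_increase(subproc,stat,n)
#define heap_stat_increase(heap,stat,n)    ((heap)->stats.stat += (int64_t)(n))
#define subproc_stat_increase(sp,stat,n)   atomic_fetch_add_explicit(&(sp)->stats.stat, (int64_t)(n), memory_order_relaxed)

int main(void) {
  heap_t heap = {{0}};
  subproc_t subproc = {{0}};
  heap_stat_increase(&heap, searches, 1);
  subproc_stat_increase(&subproc, arena_count, 1);
  return 0;
}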
-#endif + #endif /// pretend it is all free now mi_assert_internal(mi_page_thread_free(page) == NULL); @@ -460,7 +460,7 @@ void mi_heap_delete(mi_heap_t* heap) // transfer still used pages to the backing heap mi_heap_absorb(bheap, heap); } - else + else */ { // abandon all pages diff --git a/src/init.c b/src/init.c index 177ca2bd..5159941a 100644 --- a/src/init.c +++ b/src/init.c @@ -34,7 +34,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, // keys #endif NULL, // xheap - NULL, NULL, // next, prev + NULL, NULL, // next, prev MI_MEMID_STATIC // memid }; @@ -103,7 +103,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_seq &subproc_main, // subproc NULL, // heap_backing - NULL, // heaps list + NULL, // heaps list 0, // heartbeat false, // recurse false, // is_in_threadpool @@ -139,7 +139,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { 0, // thread_seq &subproc_main, // subproc &heap_main, // heap_backing - &heap_main, // heaps list + &heap_main, // heaps list 0, // heartbeat false, // recurse false, // is_in_threadpool @@ -165,7 +165,7 @@ mi_decl_cache_align mi_heap_t heap_main = { #endif MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY, - MI_MEMID_STATIC + MI_MEMID_STATIC }; @@ -237,7 +237,7 @@ static void mi_tld_main_init(void) { // Initialization of the (statically allocated) main heap, and the main tld and subproc. static void mi_heap_main_init(void) { - if (heap_main.cookie == 0) { + if (heap_main.cookie == 0) { mi_subproc_main_init(); mi_tld_main_init(); // heap @@ -249,7 +249,7 @@ static void mi_heap_main_init(void) { #endif heap_main.cookie = _mi_heap_random_next(&heap_main); heap_main.keys[0] = _mi_heap_random_next(&heap_main); - heap_main.keys[1] = _mi_heap_random_next(&heap_main); + heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); @@ -266,14 +266,21 @@ mi_heap_t* heap_main_get(void) { Thread local data ----------------------------------------------------------- */ -// Thread sequence number -static _Atomic(size_t) mi_tcount; +// Count current and total created threads +static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); +static _Atomic(size_t) thread_total_count; + +size_t _mi_current_thread_count(void) { + return mi_atomic_load_relaxed(&thread_count); +} + // The mimalloc thread local data -mi_decl_thread mi_tld_t* mi_tld; +mi_decl_thread mi_tld_t* thread_tld = &tld_empty; // Allocate fresh tld static mi_tld_t* mi_tld_alloc(void) { + mi_atomic_increment_relaxed(&thread_count); if (_mi_is_main_thread()) { return &tld_main; } @@ -292,7 +299,7 @@ static mi_tld_t* mi_tld_alloc(void) { tld->heaps = NULL; tld->subproc = &subproc_main; tld->thread_id = _mi_prim_thread_id(); - tld->thread_seq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1); tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); return tld; } @@ -301,28 +308,38 @@ static mi_tld_t* mi_tld_alloc(void) { #define MI_TLD_INVALID ((mi_tld_t*)1) mi_decl_noinline static void mi_tld_free(void) { - mi_tld_t* tld = _mi_tld(); - mi_tld = MI_TLD_INVALID; - _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); + mi_tld_t* tld = _mi_tld(); + if (tld != NULL && tld != MI_TLD_INVALID) { + _mi_stats_done(&tld->stats); + _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); + } + tld = MI_TLD_INVALID; + mi_atomic_decrement_relaxed(&thread_count); } 
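The `mi_tld_free`/`_mi_tld` pair above manages the thread-local `thread_tld` pointer as a small state machine: it starts at `&tld_empty`, is allocated lazily on first use, and is poisoned with `MI_TLD_INVALID` once the thread has terminated so that late accesses are reported instead of crashing. (Note that `tld = MI_TLD_INVALID;` in `mi_tld_free` only overwrites the local variable; presumably the intent is to poison `thread_tld` itself, as the sketch below does.) A minimal standalone sketch of the pattern with illustrative names:

#include <stdio.h>
#include <stdlib.h>

typedef struct tld_s { int thread_seq; } tld_t;

static tld_t tld_empty;                               // "not initialized yet"
#define TLD_INVALID ((tld_t*)1)                       // "thread already terminated"
static _Thread_local tld_t* thread_tld = &tld_empty;  // per-thread pointer

static tld_t* tld_get(void) {
  tld_t* tld = thread_tld;
  if (tld == TLD_INVALID) {                 // late access after thread exit: warn and recover
    fprintf(stderr, "tld accessed after the thread terminated\n");
    thread_tld = &tld_empty;
    tld = &tld_empty;
  }
  if (tld == &tld_empty) {                  // first access: allocate lazily
    thread_tld = tld = calloc(1, sizeof(tld_t));
  }
  return tld;
}

static void tld_done(void) {                // called at thread exit
  tld_t* tld = thread_tld;
  if (tld != NULL && tld != TLD_INVALID && tld != &tld_empty) { free(tld); }
  thread_tld = TLD_INVALID;                 // poison so a later access is detected
}

int main(void) {
  tld_get()->thread_seq = 1;
  tld_done();
  return 0;
}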
mi_decl_noinline mi_tld_t* _mi_tld(void) { - if (mi_tld == MI_TLD_INVALID) { - _mi_error_message(EFAULT, "internal error: tld accessed after the thread terminated\n"); - mi_tld = NULL; + mi_tld_t* tld = thread_tld; + if (tld == MI_TLD_INVALID) { + _mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n"); + thread_tld = &tld_empty; } - if (mi_tld==NULL) { - mi_tld = mi_tld_alloc(); + if (tld==&tld_empty) { + thread_tld = tld = mi_tld_alloc(); } - return mi_tld; + return tld; } mi_subproc_t* _mi_subproc(void) { - if (_mi_is_main_thread()) { // during initialization we should not recurse over reading the _mi_tld - return &subproc_main; + // should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()` + // todo: this will still fail on OS systems where the first access to a thread-local causes allocation. + // on such systems we can check for this with the _mi_prim_get_default_heap as those are protected (by being + // stored in a TLS slot for example) + mi_heap_t* heap = mi_prim_get_default_heap(); + if (heap == NULL || heap == &_mi_heap_empty) { + return _mi_subproc_main(); } else { - return _mi_tld()->subproc; + return thread_tld->subproc; // don't call `_mi_tld()` } } @@ -396,11 +413,11 @@ static bool _mi_thread_heap_init(void) { //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { - // allocates tld data - // note: we cannot access thread-locals yet as that can cause (recursive) allocation + // allocates tld data + // note: we cannot access thread-locals yet as that can cause (recursive) allocation // (on macOS <= 14 for example where the loader allocates thread-local data on demand). - mi_tld_t* tld = mi_tld_alloc(); - + mi_tld_t* tld = mi_tld_alloc(); + // allocate and initialize the heap mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld); @@ -409,7 +426,7 @@ static bool _mi_thread_heap_init(void) { _mi_heap_set_default_direct(heap); // now that the heap is set for this thread, we can set the thread-local tld. 
- mi_tld = tld; + thread_tld = tld; } return false; } @@ -444,9 +461,6 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { _mi_heap_collect_abandon(heap); } - // merge stats - _mi_stats_done(&heap->tld->stats); - // free heap meta data _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); @@ -494,11 +508,6 @@ bool _mi_is_main_thread(void) { return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id()); } -static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); - -size_t _mi_current_thread_count(void) { - return mi_atomic_load_relaxed(&thread_count); -} // This is called from the `mi_malloc_generic` void mi_thread_init(void) mi_attr_noexcept @@ -511,8 +520,7 @@ void mi_thread_init(void) mi_attr_noexcept // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) if (_mi_thread_heap_init()) return; // returns true if already initialized - _mi_stat_increase(&_mi_stats_main.threads, 1); - mi_atomic_increment_relaxed(&thread_count); + mi_subproc_stat_increase(_mi_subproc_main(), threads, 1); //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id()); } @@ -534,15 +542,14 @@ void _mi_thread_done(mi_heap_t* heap) } // adjust stats - mi_atomic_decrement_relaxed(&thread_count); - _mi_stat_decrease(&_mi_stats_main.threads, 1); + mi_subproc_stat_decrease(_mi_subproc_main(), threads, 1); // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... if (heap->tld->thread_id != _mi_prim_thread_id()) return; // abandon the thread local heap _mi_thread_heap_done(heap); // returns true if already ran - + // free thread local data mi_tld_free(); } @@ -654,7 +661,7 @@ void mi_process_init(void) mi_attr_noexcept { _mi_prim_thread_associate_default_heap(NULL); #endif - mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) + mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) mi_track_init(); if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { diff --git a/src/os.c b/src/os.c index 86ecb16b..53e8f571 100644 --- a/src/os.c +++ b/src/os.c @@ -114,9 +114,9 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } if (still_committed) { - _mi_stat_decrease(&os_stats->committed, size); + mi_os_stat_decrease(committed, size); } - _mi_stat_decrease(&os_stats->reserved, size); + mi_os_stat_decrease(reserved, size); } void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) { @@ -171,11 +171,11 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large); } - _mi_stat_counter_increase(&os_stats->mmap_calls, 1); + mi_os_stat_counter_increase(mmap_calls, 1); if (p != NULL) { - _mi_stat_increase(&os_stats->reserved, size); + mi_os_stat_increase(reserved, size); if (commit) { - _mi_stat_increase(&os_stats->committed, size); + mi_os_stat_increase(committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } @@ -290,7 +290,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); 
alignment = _mi_align_up(alignment, _mi_os_page_size()); - + bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; @@ -379,8 +379,8 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { if (is_zero != NULL) { *is_zero = false; } - _mi_stat_increase(&os_stats->committed, size); // use size for precise commit vs. decommit - _mi_stat_counter_increase(&os_stats->commit_calls, 1); + mi_os_stat_increase(committed, size); // use size for precise commit vs. decommit + mi_os_stat_counter_increase(commit_calls, 1); // page align range size_t csize; @@ -408,7 +408,7 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) { mi_assert_internal(needs_recommit!=NULL); - _mi_stat_decrease(&os_stats->committed, size); + mi_os_stat_decrease(committed, size); // page align size_t csize; @@ -440,8 +440,8 @@ bool _mi_os_reset(void* addr, size_t size) { size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) - _mi_stat_increase(&os_stats->reset, csize); - _mi_stat_counter_increase(&os_stats->reset_calls, 1); + mi_os_stat_increase(reset, csize); + mi_os_stat_counter_increase(reset_calls, 1); #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN memset(start, 0, csize); // pretend it is eagerly reset @@ -460,8 +460,8 @@ bool _mi_os_reset(void* addr, size_t size) { bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) { if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? - _mi_stat_counter_increase(&os_stats->purge_calls, 1); - _mi_stat_increase(&os_stats->purged, size); + mi_os_stat_counter_increase(purge_calls, 1); + mi_os_stat_increase(purged, size); if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? !_mi_preloading()) // don't decommit during preloading (unsafe) @@ -595,8 +595,8 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // success, record it page++; // increase before timeout check (see issue #711) - _mi_stat_increase(&os_stats->committed, MI_HUGE_OS_PAGE_SIZE); - _mi_stat_increase(&os_stats->reserved, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(committed, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout if (max_msecs > 0) { diff --git a/src/page.c b/src/page.c index 0444b47e..31dbcc7d 100644 --- a/src/page.c +++ b/src/page.c @@ -387,9 +387,9 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { const size_t bsize = mi_page_block_size(page); if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? - mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); + mi_debug_heap_stat_counter_increase(heap, page_no_retire, 1); + page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE); @@ -554,7 +554,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { size_t page_size; //uint8_t* page_start = mi_page_area(page, &page_size); - mi_heap_stat_counter_increase(heap, pages_extended, 1); + mi_debug_heap_stat_counter_increase(heap, pages_extended, 1); // calculate the extend count const size_t bsize = mi_page_block_size(page); @@ -583,7 +583,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { } // enable the new free list page->capacity += (uint16_t)extend; - mi_heap_stat_increase(heap, page_committed, extend * bsize); + mi_debug_heap_stat_increase(heap, page_committed, extend * bsize); mi_assert_expensive(mi_page_is_valid_init(page)); } @@ -709,8 +709,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page = next; } // for each page - mi_heap_stat_counter_increase(heap, searches, count); - + mi_debug_heap_stat_counter_increase(heap, searches, count); + // set the page to the best candidate if (page_candidate != NULL) { page = page_candidate; diff --git a/src/stats.c b/src/stats.c index bb17b936..2a395ed5 100644 --- a/src/stats.c +++ b/src/stats.c @@ -19,88 +19,93 @@ terms of the MIT license. A copy of the license can be found in the file Statistics operations ----------------------------------------------------------- */ -static bool mi_is_in_main(void* stat) { - return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main - && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); +static void mi_stat_update_mt(mi_stat_count_t* stat, int64_t amount) { + if (amount == 0) return; + // add atomically + int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_maxi64_relaxed(&stat->peak, current + amount); + if (amount > 0) { + mi_atomic_addi64_relaxed(&stat->allocated, amount); + } + else { + mi_atomic_addi64_relaxed(&stat->freed, -amount); + } } static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { if (amount == 0) return; - if mi_unlikely(mi_is_in_main(stat)) - { - // add atomically (for abandoned pages) - int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_maxi64_relaxed(&stat->peak, current + amount); - if (amount > 0) { - mi_atomic_addi64_relaxed(&stat->allocated,amount); - } - else { - mi_atomic_addi64_relaxed(&stat->freed, -amount); - } + // add thread local + stat->current += amount; + if (stat->current > stat->peak) stat->peak = stat->current; + if (amount > 0) { + stat->allocated += amount; } else { - // add thread local - stat->current += amount; - if (stat->current > stat->peak) stat->peak = stat->current; - if (amount > 0) { - stat->allocated += amount; - } - else { - stat->freed += -amount; - } + stat->freed += -amount; } } + // Adjust stats to compensate; for example before committing a range, // first adjust downwards with parts that were already committed so // we avoid double counting. +static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { + if (amount == 0) return; + // adjust atomically + mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_addi64_relaxed((on_alloc ? 
&stat->allocated : &stat->freed), amount); +} + static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { if (amount == 0) return; - if mi_unlikely(mi_is_in_main(stat)) - { - // adjust atomically - mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount); + stat->current += amount; + if (on_alloc) { + stat->allocated += amount; } else { - // don't affect the peak - stat->current += amount; - if (on_alloc) { - stat->allocated += amount; - } - else { - stat->freed += amount; - } + stat->freed += amount; } } -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { - if (mi_is_in_main(stat)) { - mi_atomic_addi64_relaxed( &stat->count, 1 ); - mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount ); - } - else { - stat->count++; - stat->total += amount; - } +void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) { + mi_atomic_addi64_relaxed(&stat->count, 1); + mi_atomic_addi64_relaxed(&stat->total, (int64_t)amount); } -void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) { +void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { + stat->count++; + stat->total += amount; +} + +void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_update_mt(stat, (int64_t)amount); +} +void __mi_stat_increase(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, (int64_t)amount); } -void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { +void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_update_mt(stat, -((int64_t)amount)); +} +void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, -((int64_t)amount)); } -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) { +void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust_mt(stat, (int64_t)amount, on_alloc); +} +void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) { mi_stat_adjust(stat, (int64_t)amount, on_alloc); } -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) { +void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust_mt(stat, -((int64_t)amount), on_alloc); +} +void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) { mi_stat_adjust(stat, -((int64_t)amount), on_alloc); } + // must be thread safe as it is called from stats_merge static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) { if (stat==src) return; @@ -401,27 +406,29 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) static mi_msecs_t mi_process_start; // = 0 -static mi_stats_t* mi_stats_get_default(void) { - mi_heap_t* heap = mi_heap_get_default(); - return &heap->tld->stats; +// return thread local stats +static mi_stats_t* mi_get_tld_stats(void) { + return &_mi_tld()->stats; } static void mi_stats_merge_from(mi_stats_t* stats) { - if (stats != &_mi_stats_main) { - mi_stats_add(&_mi_stats_main, stats); - memset(stats, 0, sizeof(mi_stats_t)); + mi_subproc_t* subproc = _mi_subproc(); + if (stats != &subproc->stats) { + mi_stats_add(&subproc->stats, stats); + _mi_memzero(stats, sizeof(mi_stats_t)); } } void mi_stats_reset(void) mi_attr_noexcept { - mi_stats_t* stats = mi_stats_get_default(); - if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); } - 
memset(&_mi_stats_main, 0, sizeof(mi_stats_t)); + mi_stats_t* stats = mi_get_tld_stats(); + mi_subproc_t* subproc = _mi_subproc(); + if (stats != &subproc->stats) { _mi_memzero(stats, sizeof(mi_stats_t)); } + _mi_memzero(&subproc->stats, sizeof(mi_stats_t)); if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }; } void mi_stats_merge(void) mi_attr_noexcept { - mi_stats_merge_from( mi_stats_get_default() ); + mi_stats_merge_from( mi_get_tld_stats() ); } void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` @@ -429,8 +436,8 @@ void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` } void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - mi_stats_merge_from(mi_stats_get_default()); - _mi_stats_print(&_mi_stats_main, out, arg); + mi_stats_merge_from(mi_get_tld_stats()); + _mi_stats_print(&_mi_subproc()->stats, out, arg); } void mi_stats_print(void* out) mi_attr_noexcept { @@ -439,7 +446,7 @@ void mi_stats_print(void* out) mi_attr_noexcept { } void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - _mi_stats_print(mi_stats_get_default(), out, arg); + _mi_stats_print(mi_get_tld_stats(), out, arg); } @@ -473,11 +480,12 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) { mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept { + mi_subproc_t* subproc = _mi_subproc(); mi_process_info_t pinfo; _mi_memzero_var(pinfo); pinfo.elapsed = _mi_clock_end(mi_process_start); - pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); - pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); + pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.current))); + pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.peak))); pinfo.current_rss = pinfo.current_commit; pinfo.peak_rss = pinfo.peak_commit; pinfo.utime = 0; diff --git a/test/test-stress.c b/test/test-stress.c index b35743df..0920a02e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -48,10 +48,10 @@ static int ITER = 20; static int THREADS = 32; static int SCALE = 50; static int ITER = 50; -#elif 0 -static int THREADS = 64; -static int SCALE = 400; -static int ITER = 10; +#elif 1 +static int THREADS = 32; +static int SCALE = 25; +static int ITER = 50; #define ALLOW_LARGE true #else static int THREADS = 32; // more repeatable if THREADS <= #processors From 95aeda4cdda2431c20ed9fa3facb241b142ae773 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 10:53:34 -0800 Subject: [PATCH 115/264] merge subproc stats on delete --- include/mimalloc/internal.h | 1 + src/init.c | 4 ++++ src/stats.c | 23 +++++++++++------------ 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 7774b378..e316de94 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -203,6 +203,7 @@ void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); // "stats.c" void _mi_stats_done(mi_stats_t* stats); +void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from); mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); diff --git a/src/init.c b/src/init.c index 
5159941a..3af4f4ef 100644 --- a/src/init.c +++ b/src/init.c @@ -382,6 +382,10 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { mi_lock_release(&subproc->os_pages_lock); } if (!safe_to_delete) return; + + // merge stats back into the main subproc? + _mi_stats_merge_from(&_mi_subproc_main()->stats, &subproc->stats); + // safe to release // todo: should we refcount subprocesses? mi_lock_done(&subproc->os_pages_lock); diff --git a/src/stats.c b/src/stats.c index 2a395ed5..102373ec 100644 --- a/src/stats.c +++ b/src/stats.c @@ -411,14 +411,6 @@ static mi_stats_t* mi_get_tld_stats(void) { return &_mi_tld()->stats; } -static void mi_stats_merge_from(mi_stats_t* stats) { - mi_subproc_t* subproc = _mi_subproc(); - if (stats != &subproc->stats) { - mi_stats_add(&subproc->stats, stats); - _mi_memzero(stats, sizeof(mi_stats_t)); - } -} - void mi_stats_reset(void) mi_attr_noexcept { mi_stats_t* stats = mi_get_tld_stats(); mi_subproc_t* subproc = _mi_subproc(); @@ -427,16 +419,23 @@ void mi_stats_reset(void) mi_attr_noexcept { if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }; } -void mi_stats_merge(void) mi_attr_noexcept { - mi_stats_merge_from( mi_get_tld_stats() ); +void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from) { + if (to != from) { + mi_stats_add(to, from); + _mi_memzero(from, sizeof(mi_stats_t)); + } } void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` - mi_stats_merge_from(stats); + _mi_stats_merge_from(&_mi_subproc()->stats, stats); +} + +void mi_stats_merge(void) mi_attr_noexcept { + _mi_stats_done( mi_get_tld_stats() ); } void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - mi_stats_merge_from(mi_get_tld_stats()); + mi_stats_merge(); _mi_stats_print(&_mi_subproc()->stats, out, arg); } From 4ad7fedd25e0869aa6fbca2aa24fe08dd4eebc39 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 11:35:30 -0800 Subject: [PATCH 116/264] track os abandoned pages in a list --- include/mimalloc/atomic.h | 25 ++++++++--------- include/mimalloc/types.h | 4 +-- src/arena-meta.c | 7 +++-- src/arena.c | 56 ++++++++++++++++++++++++++------------- src/init.c | 11 ++++---- 5 files changed, 61 insertions(+), 42 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 0c7fafe3..fcd9efba 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -415,6 +415,8 @@ static inline void mi_atomic_yield(void) { #pragma warning(disable:26110) // unlock with holding lock #endif +#define mi_lock(lock) for(bool _go = (mi_lock_acquire(lock),true); _go; (mi_lock_release(lock), _go=false) ) + #if defined(_WIN32) #if 0 @@ -424,9 +426,8 @@ static inline void mi_atomic_yield(void) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryEnterCriticalSection(lock); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { EnterCriticalSection(lock); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { LeaveCriticalSection(lock); @@ -445,9 +446,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryAcquireSRWLockExclusive(lock); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { AcquireSRWLockExclusive(lock); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { ReleaseSRWLockExclusive(lock); @@ -468,8 +468,11 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool 
mi_lock_try_acquire(mi_lock_t* lock) { return (pthread_mutex_trylock(lock) == 0); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); +static inline void mi_lock_acquire(mi_lock_t* lock) { + const int err = pthread_mutex_lock(lock); + if (err != 0) { + mi_error_message(EFAULT, "internal error: lock cannot be acquired\n"); + } } static inline void mi_lock_release(mi_lock_t* lock) { pthread_mutex_unlock(lock); @@ -489,9 +492,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return lock->try_lock(); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { lock->lock(); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { lock->unlock(); @@ -514,12 +516,11 @@ static inline bool mi_lock_try_acquire(mi_lock_t* lock) { uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { for (int i = 0; i < 1000; i++) { // for at most 1000 tries? - if (mi_lock_try_acquire(lock)) return true; + if (mi_lock_try_acquire(lock)) return; mi_atomic_yield(); - } - return true; + } } static inline void mi_lock_release(mi_lock_t* lock) { mi_atomic_store_release(lock, (uintptr_t)0); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ca3913ad..59393848 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -556,8 +556,8 @@ typedef struct mi_subproc_s { mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process - mi_page_queue_t os_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) - mi_lock_t os_pages_lock; // lock for the os pages list (this lock protects list operations) + mi_page_t* os_abandoned_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) + mi_lock_t os_abandoned_pages_lock; // lock for the os abandoned pages list (this lock protects list operations) mi_memid_t memid; // provenance of this memory block (meta or OS) mi_stats_t stats; // sub-process statistics (tld stats are merged in on thread termination) diff --git a/src/arena-meta.c b/src/arena-meta.c index f28c50e9..a5dc8e75 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -64,12 +64,11 @@ static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { // allocate a fresh meta page and add it to the global list. static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice - // note: we always use subproc_main directly for the meta-data since at thread start the metadata for the - // tld and heap need to be (meta) allocated and at that time we cannot read the tld pointer (yet). + // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. 
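The `mi_lock(lock) { ... }` statement macro added in `atomic.h` above wraps acquire/release into a scoped block by running a one-iteration `for` loop: the init clause acquires, the body runs once, and the increment clause releases. A standalone sketch of the same trick using a plain pthread mutex (illustrative only; note that a `break` or `return` inside the block skips the release, which holds for the mimalloc macro as well):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

// one-iteration for-loop: acquire in the init clause, release in the increment clause
#define with_lock(m) \
  for (bool _go = (pthread_mutex_lock(m), true); _go; (pthread_mutex_unlock(m), _go = false))

static pthread_mutex_t count_lock = PTHREAD_MUTEX_INITIALIZER;
static int count = 0;

int main(void) {
  with_lock(&count_lock) {   // reads like a scoped critical section
    count++;
  }                          // the unlock has run by the time we get here
  printf("count=%d\n", count);
  return 0;
}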
mi_memid_t memid; - mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc_main(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, + mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, true /* commit*/, true /* allow large */, - NULL, 0 /* tseq */, &memid ); + NULL /* req arena */, 0 /* thread_seq */, &memid); if (mpage == NULL) return NULL; mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { diff --git a/src/arena.c b/src/arena.c index dcff8920..c4b02cf6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -439,24 +439,20 @@ static mi_decl_noinline void* mi_arenas_try_alloc( // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) const size_t arena_count = mi_arenas_get_count(subproc); - if (mi_lock_acquire(&subproc->arena_reserve_lock)) { - bool ok = true; + mi_lock(&subproc->arena_reserve_lock) { if (arena_count == mi_arenas_get_count(subproc)) { // we are the first to enter the lock, reserve a fresh arena mi_arena_id_t arena_id = 0; - ok = mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); + mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); } else { // another thread already reserved a new arena } - mi_lock_release(&subproc->arena_reserve_lock); - if (ok) { - // try once more to allocate in the new arena - mi_assert_internal(req_arena == NULL); - p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); - if (p != NULL) return p; - } - } + } + // try once more to allocate in the new arena + mi_assert_internal(req_arena == NULL); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); + if (p != NULL) return p; return NULL; } @@ -685,11 +681,13 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ else { page->block_size_shift = 0; } + // and own it + mi_page_try_claim_ownership(page); + + // register in the page map _mi_page_map_register(page); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); - - mi_page_try_claim_ownership(page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_page_is_owned(page)); @@ -771,7 +769,8 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_all_free(page)); - mi_assert_internal(page->next==NULL); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(page->next==NULL && page->prev==NULL); #if MI_DEBUG>1 if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { @@ -790,6 +789,7 @@ void _mi_arena_page_free(mi_page_t* page) { } #endif + // unregister page _mi_page_map_unregister(page); if (page->memid.memkind == MI_MEM_ARENA) { mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); @@ -807,7 +807,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(!mi_page_all_free(page)); - mi_assert_internal(page->next==NULL); + mi_assert_internal(page->next==NULL && page->prev == NULL); if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { // make 
available for allocations @@ -827,8 +827,19 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_subproc_stat_increase(arena->subproc, pages_abandoned, 1); } else { - // page is full (or a singleton), page is OS/externally allocated + // page is full (or a singleton), or the page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page + mi_subproc_t* subproc = _mi_subproc(); + // but for non-arena pages, add to the subproc list so these can be visited + if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_lock(&subproc->os_abandoned_pages_lock) { + // push in front + page->prev = NULL; + page->next = subproc->os_abandoned_pages; + if (page->next != NULL) { page->next->prev = page; } + subproc->os_abandoned_pages = page; + } + } mi_subproc_stat_increase(_mi_subproc(), pages_abandoned, 1); } _mi_page_unown(page); @@ -881,9 +892,18 @@ void _mi_arena_page_unabandon(mi_page_t* page) { } else { // page is full (or a singleton), page is OS allocated - // nothing to do - // TODO: maintain count of these as well? + mi_subproc_t* subproc = _mi_subproc(); mi_subproc_stat_decrease(_mi_subproc(), pages_abandoned, 1); + // if not an arena page, remove from the subproc os pages list + if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_lock(&subproc->os_abandoned_pages_lock) { + if (page->prev != NULL) { page->prev->next = page->next; } + if (page->next != NULL) { page->next->prev = page->prev; } + if (subproc->os_abandoned_pages == page) { subproc->os_abandoned_pages = page->next; } + page->next = NULL; + page->prev = NULL; + } + } } } diff --git a/src/init.c b/src/init.c index 3af4f4ef..1968ef68 100644 --- a/src/init.c +++ b/src/init.c @@ -223,7 +223,7 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { static void mi_subproc_main_init(void) { if (subproc_main.memid.memkind != MI_MEM_STATIC) { subproc_main.memid = _mi_memid_create(MI_MEM_STATIC); - mi_lock_init(&subproc_main.os_pages_lock); + mi_lock_init(&subproc_main.os_abandoned_pages_lock); mi_lock_init(&subproc_main.arena_reserve_lock); } } @@ -361,7 +361,7 @@ mi_subproc_id_t mi_subproc_new(void) { mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid); if (subproc == NULL) return NULL; subproc->memid = memid; - mi_lock_init(&subproc->os_pages_lock); + mi_lock_init(&subproc->os_abandoned_pages_lock); mi_lock_init(&subproc->arena_reserve_lock); return subproc; } @@ -375,11 +375,10 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); // check if there are os pages still.. bool safe_to_delete = false; - if (mi_lock_acquire(&subproc->os_pages_lock)) { - if (subproc->os_pages.first == NULL) { + mi_lock(&subproc->os_abandoned_pages_lock) { + if (subproc->os_abandoned_pages == NULL) { safe_to_delete = true; } - mi_lock_release(&subproc->os_pages_lock); } if (!safe_to_delete) return; @@ -388,7 +387,7 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { // safe to release // todo: should we refcount subprocesses? 
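`_mi_arena_page_abandon` and `_mi_arena_page_unabandon` above keep abandoned pages that are OS-allocated (not in any arena) on the doubly-linked `os_abandoned_pages` list, guarded by `os_abandoned_pages_lock`, and `mi_subproc_delete` refuses to tear the sub-process down while that list is non-empty. A minimal sketch of the same push-front/unlink pattern, using illustrative types and a pthread mutex in place of `mi_lock_t`:

#include <pthread.h>
#include <stddef.h>

typedef struct page_s {
  struct page_s* next;
  struct page_s* prev;
} page_t;

typedef struct subproc_s {
  page_t*         os_abandoned_pages;       // head of the abandoned OS pages
  pthread_mutex_t os_abandoned_pages_lock;  // protects list operations only
} subproc_t;

// push in front (as _mi_arena_page_abandon does)
static void os_abandoned_push(subproc_t* sp, page_t* page) {
  pthread_mutex_lock(&sp->os_abandoned_pages_lock);
  page->prev = NULL;
  page->next = sp->os_abandoned_pages;
  if (page->next != NULL) { page->next->prev = page; }
  sp->os_abandoned_pages = page;
  pthread_mutex_unlock(&sp->os_abandoned_pages_lock);
}

// unlink from anywhere in the list (as _mi_arena_page_unabandon does)
static void os_abandoned_remove(subproc_t* sp, page_t* page) {
  pthread_mutex_lock(&sp->os_abandoned_pages_lock);
  if (page->prev != NULL) { page->prev->next = page->next; }
  if (page->next != NULL) { page->next->prev = page->prev; }
  if (sp->os_abandoned_pages == page) { sp->os_abandoned_pages = page->next; }
  page->next = NULL; page->prev = NULL;
  pthread_mutex_unlock(&sp->os_abandoned_pages_lock);
}

int main(void) {
  subproc_t sp = { NULL, PTHREAD_MUTEX_INITIALIZER };
  page_t p1 = { NULL, NULL }, p2 = { NULL, NULL };
  os_abandoned_push(&sp, &p1);
  os_abandoned_push(&sp, &p2);     // list: p2 -> p1
  os_abandoned_remove(&sp, &p1);   // list: p2
  return (sp.os_abandoned_pages == &p2 && p2.next == NULL) ? 0 : 1;
}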
- mi_lock_done(&subproc->os_pages_lock); + mi_lock_done(&subproc->os_abandoned_pages_lock); mi_lock_done(&subproc->arena_reserve_lock); _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid); } From 89b0d5a357af02809509544f83c92e7f5be11a3f Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 11:53:29 -0800 Subject: [PATCH 117/264] allocate heaps associated with an arena in that arena --- include/mimalloc/internal.h | 11 ++++++----- include/mimalloc/types.h | 21 ++++++--------------- src/arena-meta.c | 5 +---- src/arena.c | 6 ++---- src/heap.c | 14 +++++++++++--- src/init.c | 10 +++++----- 6 files changed, 31 insertions(+), 36 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e316de94..208989e3 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -147,6 +147,7 @@ mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); void* _mi_arena_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void _mi_arena_free(void* p, size_t size, mi_memid_t memid); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge); @@ -421,11 +422,11 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) { return (heap != &_mi_heap_empty); } -static inline uintptr_t _mi_ptr_cookie(const void* p) { - extern mi_heap_t _mi_heap_main; - mi_assert_internal(_mi_heap_main.cookie != 0); - return ((uintptr_t)p ^ _mi_heap_main.cookie); -} +//static inline uintptr_t _mi_ptr_cookie(const void* p) { +// extern mi_heap_t _mi_heap_main; +// mi_assert_internal(_mi_heap_main.cookie != 0); +// return ((uintptr_t)p ^ _mi_heap_main.cookie); +//} /* ----------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 59393848..461b5393 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -396,7 +396,6 @@ struct mi_heap_s { mi_tld_t* tld; // thread-local data mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) - uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation size_t page_count; // total number of pages in the `pages` queues. 
size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) @@ -522,21 +521,13 @@ void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount); #define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount) #define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount) -#define mi_tld_stat_counter_increase(tld,stat,amount) __mi_stat_counter_increase( &(tld)->stats.stat, amount) -#define mi_tld_stat_increase(tld,stat,amount) __mi_stat_increase( &(tld)->stats.stat, amount) -#define mi_tld_stat_decrease(tld,stat,amount) __mi_stat_decrease( &(tld)->stats.stat, amount) +#define mi_heap_stat_counter_increase(heap,stat,amount) __mi_stat_counter_increase( &(heap)->tld->stats.stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) __mi_stat_increase( &(heap)->tld->stats.stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) __mi_stat_decrease( &(heap)->tld->stats.stat, amount) -#define mi_debug_tld_stat_counter_increase(tld,stat,amount) mi_debug_stat_counter_increase( (tld)->stats.stat, amount) -#define mi_debug_tld_stat_increase(tld,stat,amount) mi_debug_stat_increase( (tld)->stats.stat, amount) -#define mi_debug_tld_stat_decrease(tld,stat,amount) mi_debug_stat_decrease( (tld)->stats.stat, amount) - -#define mi_heap_stat_counter_increase(heap,stat,amount) mi_tld_stat_counter_increase((heap)->tld, stat, amount) -#define mi_heap_stat_increase(heap,stat,amount) mi_tld_stat_increase( (heap)->tld, stat, amount) -#define mi_heap_stat_decrease(heap,stat,amount) mi_tld_stat_decrease( (heap)->tld, stat, amount) - -#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_tld_stat_counter_increase((heap)->tld, stat, amount) -#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_tld_stat_increase( (heap)->tld, stat, amount) -#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_tld_stat_decrease( (heap)->tld, stat, amount) +#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_stat_counter_increase( (heap)->tld->stats.stat, amount) +#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_stat_increase( (heap)->tld->stats.stat, amount) +#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_stat_decrease( (heap)->tld->stats.stat, amount) // ------------------------------------------------------ diff --git a/src/arena-meta.c b/src/arena-meta.c index a5dc8e75..065a1331 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -148,11 +148,8 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); mi_bitmap_setN(&mpage->blocks_free, block_idx, block_count,NULL); } - else if (mi_memid_is_os(memid)) { - _mi_os_free(p, size, memid); - } else { - mi_assert_internal(mi_memid_needs_no_free(memid)); + _mi_arena_free(p,size,memid); } } diff --git a/src/arena.c b/src/arena.c index c4b02cf6..869cba49 100644 --- a/src/arena.c +++ b/src/arena.c @@ -762,8 +762,6 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block return page; } -static void mi_arena_free(void* p, size_t size, mi_memid_t memid); - void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); @@ -794,7 +792,7 @@ void _mi_arena_page_free(mi_page_t* page) { if (page->memid.memkind == MI_MEM_ARENA) { 
mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); } - mi_arena_free(page, mi_memid_size(page->memid), page->memid); + _mi_arena_free(page, mi_memid_size(page->memid), page->memid); } /* ----------------------------------------------------------- @@ -920,7 +918,7 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); static void mi_arenas_try_purge(bool force, bool visit_all); -static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { +void _mi_arena_free(void* p, size_t size, mi_memid_t memid) { if (p==NULL) return; if (size==0) return; diff --git a/src/heap.c b/src/heap.c index d82b383f..f47aaad9 100644 --- a/src/heap.c +++ b/src/heap.c @@ -213,8 +213,8 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint _mi_random_split(&heap->tld->heap_backing->random, &heap->random); } heap->cookie = _mi_heap_random_next(heap) | 1; - heap->keys[0] = _mi_heap_random_next(heap); - heap->keys[1] = _mi_heap_random_next(heap); + //heap->keys[0] = _mi_heap_random_next(heap); + //heap->keys[1] = _mi_heap_random_next(heap);*/ _mi_heap_guarded_init(heap); // push on the thread local heaps list @@ -227,7 +227,15 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena mi_assert(heap_tag >= 0 && heap_tag < 256); // allocate and initialize a heap mi_memid_t memid; - mi_heap_t* heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); + mi_heap_t* heap; + if (arena_id == _mi_arena_id_none()) { + heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); + } + else { + // heaps associated wita a specific arena are allocated in that arena + // note: takes up at least one slice which is quite wasteful... + heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), sizeof(mi_heap_t), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); + } if (heap==NULL) { _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); return NULL; diff --git a/src/init.c b/src/init.c index 1968ef68..2f147e55 100644 --- a/src/init.c +++ b/src/init.c @@ -115,7 +115,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { &tld_empty, // tld NULL, // exclusive_arena 0, // cookie - { 0, 0 }, // keys + //{ 0, 0 }, // keys { {0}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max @@ -149,9 +149,9 @@ static mi_decl_cache_align mi_tld_t tld_main = { mi_decl_cache_align mi_heap_t heap_main = { &tld_main, // thread local data + NULL, // exclusive arena 0, // initial cookie - 0, // arena id - { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) + //{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) 
{ {0x846ca68b}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max @@ -248,8 +248,8 @@ static void mi_heap_main_init(void) { _mi_random_init(&heap_main.random); #endif heap_main.cookie = _mi_heap_random_next(&heap_main); - heap_main.keys[0] = _mi_heap_random_next(&heap_main); - heap_main.keys[1] = _mi_heap_random_next(&heap_main); + //heap_main.keys[0] = _mi_heap_random_next(&heap_main); + //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); From 7d46478a5f7c16b078b7955df95d3801eb1d585d Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 13:19:06 -0800 Subject: [PATCH 118/264] add initial load/unload for heaps --- include/mimalloc.h | 8 ++++- src/arena.c | 22 +++++++----- src/heap.c | 83 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 86 insertions(+), 27 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 7a58e54c..b0a20e9e 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -326,7 +326,13 @@ mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, //mi_decl_export void mi_os_decommit(void* p, size_t size); mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size); -mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id); +mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id); +mi_decl_export bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena); +mi_decl_export void mi_heap_unload(mi_heap_t* heap); + +// Is a pointer contained in the given arena area? +mi_decl_export bool mi_arena_contains(mi_arena_id_t arena_id, const void* p); + // ------------------------------------------------------ // Convenience diff --git a/src/arena.c b/src/arena.c index 869cba49..aa3c9175 100644 --- a/src/arena.c +++ b/src/arena.c @@ -492,7 +492,6 @@ void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? - req_arena == NULL && // not a specific arena? size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { @@ -980,13 +979,21 @@ void _mi_arenas_collect(bool force_purge) { mi_arenas_try_purge(force_purge, force_purge /* visit all? */); } + +// Is a pointer contained in the given arena area? +bool mi_arena_contains(mi_arena_id_t arena_id, const void* p) { + mi_arena_t* arena = _mi_arena_from_id(arena_id); + return (mi_arena_start(arena) <= (const uint8_t*)p && + mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p); +} + // Is a pointer inside any of our arenas? 
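The new exported `mi_arena_contains(arena_id, p)` makes the per-arena range check available to callers of the public API (the internal `_mi_arena_contains` below still scans all arenas of the sub-process). A hedged usage sketch; the sizes and the expectation that small heap allocations land inside the exclusive arena are assumptions, and error handling is minimal:

#include <mimalloc.h>
#include <assert.h>

int main(void) {
  // reserve an exclusive arena and create a heap that allocates from it
  mi_arena_id_t arena_id;
  if (mi_reserve_os_memory_ex(64 * 1024 * 1024, true /*commit*/, false /*allow large*/,
                              true /*exclusive*/, &arena_id) != 0) return 1;
  mi_heap_t* heap = mi_heap_new_in_arena(arena_id);
  void* p = mi_heap_malloc(heap, 128);

  // a block from this heap is expected to lie inside the exclusive arena
  assert(mi_arena_contains(arena_id, p));

  mi_free(p);
  mi_heap_delete(heap);
  return 0;
}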
bool _mi_arena_contains(const void* p) { mi_subproc_t* subproc = _mi_subproc(); const size_t max_arena = mi_arenas_get_count(subproc); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); - if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p) { + if (arena != NULL && mi_arena_contains(arena,p)) { return true; } } @@ -1636,7 +1643,7 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* return true; } -mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id) { +mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id) { // assume the memory area is already containing the arena if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } if (start == NULL || size == 0) return false; @@ -1658,13 +1665,10 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, _mi_warning_message("the reloaded arena is not exclusive\n"); return false; } - arena->memid.is_pinned = is_large; - arena->memid.initially_committed = is_committed; - arena->memid.initially_zero = is_zero; + arena->is_exclusive = true; - arena->is_large = is_large; - arena->subproc = NULL; - if (!mi_arena_add(_mi_subproc(), arena, arena_id)) { + arena->subproc = _mi_subproc(); + if (!mi_arena_add(arena->subproc, arena, arena_id)) { return false; } mi_arena_pages_reregister(arena); diff --git a/src/heap.c b/src/heap.c index f47aaad9..03030b47 100644 --- a/src/heap.c +++ b/src/heap.c @@ -234,7 +234,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena else { // heaps associated wita a specific arena are allocated in that arena // note: takes up at least one slice which is quite wasteful... - heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), sizeof(mi_heap_t), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); + heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); } if (heap==NULL) { _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); @@ -280,7 +280,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) { } // called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources. 
-static void mi_heap_free(mi_heap_t* heap) { +static void mi_heap_free(mi_heap_t* heap, bool do_free_mem) { mi_assert(heap != NULL); mi_assert_internal(mi_heap_is_initialized(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; @@ -307,7 +307,9 @@ static void mi_heap_free(mi_heap_t* heap) { mi_assert_internal(heap->tld->heaps != NULL); // and free the used memory - _mi_meta_free(heap, sizeof(*heap), heap->memid); + if (do_free_mem) { + _mi_meta_free(heap, sizeof(*heap), heap->memid); + } } // return a heap on the same thread as `heap` specialized for the specified tag (if it exists) @@ -403,7 +405,7 @@ void mi_heap_destroy(mi_heap_t* heap) { #endif // free all pages _mi_heap_destroy_pages(heap); - mi_heap_free(heap); + mi_heap_free(heap,true); } #endif } @@ -462,20 +464,11 @@ void mi_heap_delete(mi_heap_t* heap) mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; - /* - mi_heap_t* bheap = heap->tld->heap_backing; - if (bheap != heap && mi_heaps_are_compatible(bheap,heap)) { - // transfer still used pages to the backing heap - mi_heap_absorb(bheap, heap); - } - else - */ - { - // abandon all pages - _mi_heap_collect_abandon(heap); - } + // abandon all pages + _mi_heap_collect_abandon(heap); + mi_assert_internal(heap->page_count==0); - mi_heap_free(heap); + mi_heap_free(heap,true); } mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { @@ -489,7 +482,63 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { } +/* ----------------------------------------------------------- + Load/unload heaps +----------------------------------------------------------- */ +void mi_heap_unload(mi_heap_t* heap) { + mi_assert(mi_heap_is_initialized(heap)); + mi_assert_expensive(mi_heap_is_valid(heap)); + if (heap==NULL || !mi_heap_is_initialized(heap)) return; + if (heap->exclusive_arena == NULL) { + _mi_warning_message("cannot unload heaps that are not associated with an exclusive arena\n"); + return; + } + + // abandon all pages so all thread'id in the pages are cleared + _mi_heap_collect_abandon(heap); + mi_assert_internal(heap->page_count==0); + // remove from heap list + mi_heap_free(heap, false /* but don't actually free the memory */); + + // disassociate from the current thread-local and static state + heap->tld = NULL; + return; +} + +bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) { + mi_assert(mi_heap_is_initialized(heap)); + if (heap==NULL || !mi_heap_is_initialized(heap)) return false; + if (heap->exclusive_arena == NULL) { + _mi_warning_message("cannot reload heaps that were not associated with an exclusive arena\n"); + return false; + } + if (heap->tld != NULL) { + _mi_warning_message("cannot reload heaps that were not unloaded first\n"); + return false; + } + mi_arena_t* arena = _mi_arena_from_id(arena_id); + if (heap->exclusive_arena != arena) { + _mi_warning_message("trying to reload a heap at a different arena address: %p vs %p\n", heap->exclusive_arena, arena); + return false; + } + + mi_assert_internal(heap->page_count==0); + + // re-associate from the current thread-local and static state + heap->tld = _mi_tld(); + + // reinit direct pages (as we may be in a different process) + mi_assert_internal(heap->page_count == 0); + for (int i = 0; i < MI_PAGES_DIRECT; i++) { + heap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty; + } + + // push on the thread local heaps list + heap->next = heap->tld->heaps; + heap->tld->heaps = heap; + return true; +} /* ----------------------------------------------------------- Analysis 
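Combined with `mi_arena_unload`/`mi_arena_reload` from the arena changes earlier in this patch, the new `mi_heap_unload`/`mi_heap_reload` let a heap that lives entirely inside an exclusive arena be detached and re-attached later (potentially from another process). A rough end-to-end sketch of the intended flow; it assumes the arena stays (or is mapped back) at the same address and that passing the full reserved size to `mi_arena_reload` is correct, so treat it as an illustration rather than a tested recipe:

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // 1. reserve an exclusive arena and put a heap (and its meta-data) inside it
  mi_arena_id_t arena_id;
  if (mi_reserve_os_memory_ex(64 * 1024 * 1024, true, false, true /*exclusive*/, &arena_id) != 0) return 1;
  mi_heap_t* heap = mi_heap_new_in_arena(arena_id);
  char* msg = (char*)mi_heap_malloc(heap, 32);
  snprintf(msg, 32, "hello");

  // 2. detach: abandon the heap's pages, then unload the arena memory
  mi_heap_unload(heap);
  void*  base = NULL;
  size_t accessed_size = 0;
  size_t full_size = 0;
  if (!mi_arena_unload(arena_id, &base, &accessed_size, &full_size)) return 1;
  // ... `base`/`accessed_size` could now be handed to another process or persisted ...

  // 3. re-attach: reload the arena (same address) and rebind the heap to it
  mi_arena_id_t arena_id2;
  if (!mi_arena_reload(base, full_size, &arena_id2)) return 1;
  if (!mi_heap_reload(heap, arena_id2)) return 1;

  printf("%s\n", msg);   // data inside the arena survived the round-trip
  mi_free(msg);
  mi_heap_delete(heap);
  return 0;
}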
From 108c84e858b7ee2aa2fd3f00de03afb879e89718 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 14:45:14 -0800 Subject: [PATCH 119/264] remove req_arena parameter to arena_reserve --- src/arena.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/arena.c b/src/arena.c index aa3c9175..af1f737e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -274,11 +274,8 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id); // try to reserve a fresh arena space -static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) +static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t* arena_id) { - // if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - const size_t arena_count = mi_arenas_get_count(subproc); if (arena_count > (MI_MAX_ARENAS - 4)) return false; @@ -443,7 +440,7 @@ static mi_decl_noinline void* mi_arenas_try_alloc( if (arena_count == mi_arenas_get_count(subproc)) { // we are the first to enter the lock, reserve a fresh arena mi_arena_id_t arena_id = 0; - mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); + mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, &arena_id); } else { // another thread already reserved a new arena From c138fba149d358465345ce0316c42d626afe1328 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 15:49:17 -0800 Subject: [PATCH 120/264] merge from dev --- src/arena-abandon.c | 346 -------------------------------------------- 1 file changed, 346 deletions(-) delete mode 100644 src/arena-abandon.c diff --git a/src/arena-abandon.c b/src/arena-abandon.c deleted file mode 100644 index 460c80fc..00000000 --- a/src/arena-abandon.c +++ /dev/null @@ -1,346 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -#if !defined(MI_IN_ARENA_C) -#error "this file should be included from 'arena.c' (so mi_arena_t is visible)" -// add includes help an IDE -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "bitmap.h" -#endif - -// Minimal exports for arena-abandoned. -size_t mi_arena_id_index(mi_arena_id_t id); -mi_arena_t* mi_arena_from_index(size_t idx); -size_t mi_arena_get_count(void); -void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex); -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index); - -/* ----------------------------------------------------------- - Abandoned blocks/segments: - - _mi_arena_segment_clear_abandoned - _mi_arena_segment_mark_abandoned - - This is used to atomically abandon/reclaim segments - (and crosses the arena API but it is convenient to have here). - - Abandoned segments still have live blocks; they get reclaimed - when a thread frees a block in it, or when a thread needs a fresh - segment. - - Abandoned segments are atomically marked in the `block_abandoned` - bitmap of arenas. 
Any segments allocated outside arenas are put - in the sub-process `abandoned_os_list`. This list is accessed - using locks but this should be uncommon and generally uncontended. - Reclaim and visiting either scan through the `block_abandoned` - bitmaps of the arena's, or visit the `abandoned_os_list` - - A potentially nicer design is to use arena's for everything - and perhaps have virtual arena's to map OS allocated memory - but this would lack the "density" of our current arena's. TBC. ------------------------------------------------------------ */ - - -// reclaim a specific OS abandoned segment; `true` on success. -// sets the thread_id. -static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena, remove from list of abandoned os segments - mi_subproc_t* const subproc = segment->subproc; - if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) { - return false; // failed to acquire the lock, we just give up - } - // remove atomically from the abandoned os list (if possible!) - bool reclaimed = false; - mi_segment_t* const next = segment->abandoned_os_next; - mi_segment_t* const prev = segment->abandoned_os_prev; - if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) { - #if MI_DEBUG>3 - // find ourselves in the abandoned list (and check the count) - bool found = false; - size_t count = 0; - for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) { - if (current == segment) { found = true; } - count++; - } - mi_assert_internal(found); - mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count)); - #endif - // remove (atomically) from the list and reclaim - if (prev != NULL) { prev->abandoned_os_next = next; } - else { subproc->abandoned_os_list = next; } - if (next != NULL) { next->abandoned_os_prev = prev; } - else { subproc->abandoned_os_list_tail = prev; } - segment->abandoned_os_next = NULL; - segment->abandoned_os_prev = NULL; - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count); - if (take_lock) { // don't reset the thread_id when iterating - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - reclaimed = true; - } - if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); } - return reclaimed; -} - -// reclaim a specific abandoned segment; `true` on success. -// sets the thread_id. -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) { - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */); - } - // arena segment: use the blocks_abandoned bitmap. 
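The reclaim path that follows relies on the bitmap operations being atomic: exactly one thread can observe a bit flip from set to clear, and only that thread may take ownership of the abandoned segment. Below is a minimal standalone sketch of this test-and-clear / test-and-set pattern in C11 atomics; the single 64-bit field and the names try_reclaim/mark_abandoned are illustrative stand-ins, whereas the real code goes through _mi_bitmap_unclaim and _mi_bitmap_claim and additionally updates abandoned_count and the segment's thread_id.

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdint.h>

  // one field of an abandoned-bitmap; bit i == 1 means "slot i holds an abandoned segment"
  typedef _Atomic uint64_t abandoned_field_t;

  // atomically clear bit `idx`; returns true only for the single thread
  // that flipped it from 1 to 0 (and may thus reclaim the segment)
  static bool try_reclaim(abandoned_field_t* field, unsigned idx) {
    const uint64_t mask = (uint64_t)1 << idx;
    const uint64_t prev = atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel);
    return (prev & mask) != 0;
  }

  // atomically set bit `idx` when abandoning; returns true if it was previously clear
  static bool mark_abandoned(abandoned_field_t* field, unsigned idx) {
    const uint64_t mask = (uint64_t)1 << idx;
    const uint64_t prev = atomic_fetch_or_explicit(field, mask, memory_order_acq_rel);
    return (prev & mask) == 0;
  }
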
- size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // reclaim atomically - bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - if (was_marked) { - mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0); - mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - // mi_assert_internal(was_marked); - mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return was_marked; -} - - -// mark a specific OS segment as abandoned -static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena; we use a list of abandoned segments - mi_subproc_t* const subproc = segment->subproc; - mi_lock(&subproc->abandoned_os_lock) { - // push on the tail of the list (important for the visitor) - mi_segment_t* prev = subproc->abandoned_os_list_tail; - mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL); - mi_assert_internal(segment->abandoned_os_prev == NULL); - mi_assert_internal(segment->abandoned_os_next == NULL); - if (prev != NULL) { prev->abandoned_os_next = segment; } - else { subproc->abandoned_os_list = segment; } - subproc->abandoned_os_list_tail = segment; - segment->abandoned_os_prev = prev; - segment->abandoned_os_next = NULL; - mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count); - mi_atomic_increment_relaxed(&subproc->abandoned_count); - // and release the lock - } - return; -} - -// mark a specific segment as abandoned -// clears the thread_id. -void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) -{ - mi_assert_internal(segment->used == segment->abandoned); - mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - mi_arena_segment_os_mark_abandoned(segment); - return; - } - // segment is in an arena, mark it in the arena `blocks_abandoned` bitmap - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // set abandonment atomically - mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned - const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); } - mi_assert_internal(was_unmarked); - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); -} - - -/* ----------------------------------------------------------- - Iterate through the abandoned blocks/segments using a cursor. - This is used for reclaiming and abandoned block visiting. 
------------------------------------------------------------ */ - -// start a cursor at a randomized arena -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) { - mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); - current->bitmap_idx = 0; - current->subproc = subproc; - current->visit_all = visit_all; - current->hold_visit_lock = false; - const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count); - const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count); - const size_t max_arena = mi_arena_get_count(); - if (heap != NULL && heap->arena_id != _mi_arena_id_none()) { - // for a heap that is bound to one arena, only visit that arena - current->start = mi_arena_id_index(heap->arena_id); - current->end = current->start + 1; - current->os_list_count = 0; - } - else { - // otherwise visit all starting at a random location - if (abandoned_count > abandoned_list_count && max_arena > 0) { - current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); - current->end = current->start + max_arena; - } - else { - current->start = 0; - current->end = 0; - } - current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list - } - mi_assert_internal(current->start <= max_arena); -} - -void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) { - if (current->hold_visit_lock) { - mi_lock_release(¤t->subproc->abandoned_os_visit_lock); - current->hold_visit_lock = false; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { - // try to reclaim an abandoned segment in the arena atomically - if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL; - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - // check that the segment belongs to our sub-process - // note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled. - // without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process. - // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock. - if (segment->subproc != subproc) { - // it is from another sub-process, re-mark it and continue searching - const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - mi_assert_internal(was_zero); MI_UNUSED(was_zero); - return NULL; - } - else { - // success, we unabandoned a segment in our sub-process - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - return segment; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) { - const size_t max_arena = mi_arena_get_count(); - size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); - size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx); - // visit arena's (from the previous cursor) - for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) { - // index wraps around - size_t arena_idx = (previous->start >= max_arena ? 
previous->start % max_arena : previous->start); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - if (arena != NULL) { - bool has_lock = false; - // visit the abandoned fields (starting at previous_idx) - for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) { - size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); - if mi_unlikely(field != 0) { // skip zero fields quickly - // we only take the arena lock if there are actually abandoned segments present - if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { - has_lock = (previous->visit_all ? (mi_lock_acquire(&arena->abandoned_visit_lock),true) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); - if (!has_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); - } - // skip to next arena - break; - } - } - mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned)); - // visit each set bit in the field (todo: maybe use `ctz` here?) - for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { - // pre-check if the bit is set - size_t mask = ((size_t)1 << bit_idx); - if mi_unlikely((field & mask) == mask) { - mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); - if (segment != NULL) { - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration - return segment; - } - } - } - } - } - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - } - } - return NULL; -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) { - // go through the abandoned_os_list - // we only allow one thread per sub-process to do to visit guarded by the `abandoned_os_visit_lock`. - // The lock is released when the cursor is released. - if (!previous->hold_visit_lock) { - previous->hold_visit_lock = (previous->visit_all ? 
(mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock),true) - : mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock)); - if (!previous->hold_visit_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock"); - } - return NULL; // we cannot get the lock, give up - } - } - // One list entry at a time - while (previous->os_list_count > 0) { - previous->os_list_count--; - mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free` - mi_segment_t* segment = previous->subproc->abandoned_os_list; - // pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries) - if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) { - mi_lock_release(&previous->subproc->abandoned_os_lock); - return segment; - } - // already abandoned, try again - mi_lock_release(&previous->subproc->abandoned_os_lock); - } - // done - mi_assert_internal(previous->os_list_count == 0); - return NULL; -} - - -// reclaim abandoned segments -// this does not set the thread id (so it appears as still abandoned) -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) { - if (previous->start < previous->end) { - // walk the arena - mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous); - if (segment != NULL) { return segment; } - } - // no entries in the arena's anymore, walk the abandoned OS list - mi_assert_internal(previous->start == previous->end); - return mi_arena_segment_clear_abandoned_next_list(previous); -} - - -bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - // (unfortunately) the visit_abandoned option must be enabled from the start. - // This is to avoid taking locks if abandoned list visiting is not required (as for most programs) - if (!mi_option_is_enabled(mi_option_visit_abandoned)) { - _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); - return false; - } - mi_arena_field_cursor_t current; - _mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, ¤t); - mi_segment_t* segment; - bool ok = true; - while (ok && (segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { - ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg); - _mi_arena_segment_mark_abandoned(segment); - } - _mi_arena_field_cursor_done(¤t); - return ok; -} From da17a59bdb127e1bd5fdd1ecc3dbf8153e1ed4db Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 15:53:50 -0800 Subject: [PATCH 121/264] re-add deferred free and heap retired collect --- include/mimalloc/types.h | 1 + src/init.c | 2 ++ src/page.c | 13 ++++++++----- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 0b084558..7009a017 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -400,6 +400,7 @@ struct mi_heap_s { size_t page_count; // total number of pages in the `pages` queues. size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. 
+ size_t generic_count; // how often is mimalloc_generic invoked? mi_heap_t* next; // list of heaps per thread long full_page_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages diff --git a/src/init.c b/src/init.c index 6bbea58e..5f3fb797 100644 --- a/src/init.c +++ b/src/init.c @@ -119,6 +119,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { { {0}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max + 0, // generic count NULL, // next 0, // full page retain false, // can reclaim @@ -155,6 +156,7 @@ mi_decl_cache_align mi_heap_t heap_main = { { {0x846ca68b}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max + 0, // generic count NULL, // next heap 2, // full page retain true, // allow page reclaim diff --git a/src/page.c b/src/page.c index 31dbcc7d..c366439e 100644 --- a/src/page.c +++ b/src/page.c @@ -872,11 +872,14 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al } mi_assert_internal(mi_heap_is_initialized(heap)); - // call potential deferred free routines - // _mi_deferred_free(heap, false); - - // free delayed frees from other threads (but skip contended ones) - // _mi_heap_delayed_free_partial(heap); + // collect every N generic mallocs + if (heap->generic_count++ > 10000) { + heap->generic_count = 0; + // call potential deferred free routines + _mi_deferred_free(heap, false); + // collect retired pages + _mi_heap_collect_retired(heap, false); + } // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); From d7d626cbfae73e22ab85d92a12feb76b9bf8f981 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 16:24:56 -0800 Subject: [PATCH 122/264] enable collecting from the full page queue --- src/heap.c | 23 ----------------------- src/page.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/heap.c b/src/heap.c index 03030b47..412c6465 100644 --- a/src/heap.c +++ b/src/heap.c @@ -102,14 +102,6 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t return true; // don't break } -//static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { -// MI_UNUSED(arg1); -// MI_UNUSED(arg2); -// MI_UNUSED(heap); -// MI_UNUSED(pq); -// _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); -// return true; // don't break -//} static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) { @@ -121,21 +113,6 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // python/cpython#112532: we may be called from a thread that is not the owner of the heap // const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); - // note: never reclaim on collect but leave it to threads that need storage to reclaim - //if ( - //#ifdef NDEBUG - // collect == MI_FORCE - //#else - // collect >= MI_FORCE - //#endif - // && is_main_thread && mi_heap_is_backing(heap) && heap->allow_page_reclaim) - //{ - // // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. - // // if all memory is freed by now, all segments should be freed. 
- // // note: this only collects in the current subprocess - // _mi_arena_reclaim_all_abandoned(heap); - //} - // collect retired pages _mi_heap_collect_retired(heap, force); diff --git a/src/page.c b/src/page.c index c366439e..200cdaa9 100644 --- a/src/page.c +++ b/src/page.c @@ -433,6 +433,36 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) { } +static void mi_heap_collect_full_pages(mi_heap_t* heap) { + // note: normally full pages get immediately abandoned and the full queue is always empty + // this path is only used if abandoning is disabled due to a destroy-able heap or options + // set by the user. + mi_page_queue_t* pq = &heap->pages[MI_BIN_FULL]; + for (mi_page_t* page = pq->first; page != NULL; ) { + mi_page_t* next = page->next; // get next in case we free the page + _mi_page_free_collect(page, false); // register concurrent free's + // no longer full? + if (!mi_page_is_full(page)) { + if (mi_page_all_free(page)) { + _mi_page_free(page, pq); + } + else { + _mi_page_unfull(page); + } + } + page = next; + } +} + +static mi_decl_noinline void mi_heap_generic_collect(mi_heap_t* heap) { + // call potential deferred free routines + _mi_deferred_free(heap, false); + // collect retired pages + _mi_heap_collect_retired(heap, false); + // collect full pages that had concurrent free's + mi_heap_collect_full_pages(heap); +} + /* ----------------------------------------------------------- Initialize the initial free list in a page. In secure mode we initialize a randomized list by @@ -857,6 +887,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme } } + // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. 
// The `huge_alignment` is normally 0 but is set to a multiple of MI_SLICE_SIZE for @@ -873,17 +904,15 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_heap_is_initialized(heap)); // collect every N generic mallocs - if (heap->generic_count++ > 10000) { + if mi_unlikely(heap->generic_count++ > 10000) { heap->generic_count = 0; - // call potential deferred free routines - _mi_deferred_free(heap, false); - // collect retired pages - _mi_heap_collect_retired(heap, false); + mi_heap_generic_collect(heap); } // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more + mi_heap_generic_collect(heap); mi_heap_collect(heap, true /* force */); page = mi_find_page(heap, size, huge_alignment); } From 1e2221f5126fa3686cff9fd656842cf35059b4e6 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 19:28:53 -0800 Subject: [PATCH 123/264] fix signed/unsigned; fix heap_destroy assert failure --- src/heap.c | 3 ++- src/page-map.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/heap.c b/src/heap.c index 412c6465..a1b06c6b 100644 --- a/src/heap.c +++ b/src/heap.c @@ -340,6 +340,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // mi_page_free(page,false); page->next = NULL; page->prev = NULL; + mi_page_set_heap(page, NULL); _mi_arena_page_free(page); return true; // keep going @@ -507,7 +508,7 @@ bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) { // reinit direct pages (as we may be in a different process) mi_assert_internal(heap->page_count == 0); - for (int i = 0; i < MI_PAGES_DIRECT; i++) { + for (size_t i = 0; i < MI_PAGES_DIRECT; i++) { heap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty; } diff --git a/src/page-map.c b/src/page-map.c index 7b74c711..d6517f72 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,6 +9,14 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" +// The page-map contains a byte for each 64kb slice in the address space. +// For an address `a` where `n = _mi_page_map[a >> 16]`: +// 0 = unused +// 1 = the slice at `a & ~0xFFFF` is a mimalloc page. +// 1 < n << 127 = the slice is part of a page, starting at `(((a>>16) - n - 1) << 16)`. +// +// 1 byte per slice => 1 GiB page map = 2^30 slices of 2^16 = 2^46 = 64 TiB address space. +// 4 GiB virtual for 256 TiB address space (48 bit) (and 64 KiB for 4 GiB address space (on 32-bit)). 
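To make the byte encoding above concrete: for a slice index idx = (uintptr_t)p >> 16, a stored byte ofs of zero means the slice is not part of a mimalloc page, while a non-zero ofs means the owning page starts ofs - 1 slices below idx. This matches the computation in _mi_ptr_page_ex elsewhere in this series; note that the `- n - 1` in the comment above does not quite match that code, which computes `idx - ofs + 1`. A standalone sketch of the lookup follows, where page_map and SLICE_SHIFT are hypothetical stand-ins for _mi_page_map and MI_ARENA_SLICE_SHIFT.

  #include <stdint.h>
  #include <stddef.h>

  #define SLICE_SHIFT 16                 // 64 KiB slices, as in MI_ARENA_SLICE_SHIFT

  extern uint8_t* page_map;              // one byte per slice; stand-in for _mi_page_map

  // return the start of the mimalloc page containing p, or NULL if p is not in a page
  static void* ptr_page(const void* p) {
    const size_t  idx = (size_t)((uintptr_t)p >> SLICE_SHIFT);
    const uint8_t ofs = page_map[idx];
    if (ofs == 0) return NULL;                                  // unused slice
    return (void*)((uintptr_t)(idx - ofs + 1) << SLICE_SHIFT);  // page starts ofs-1 slices earlier
  }
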
mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; @@ -24,10 +32,11 @@ bool _mi_page_map_init(void) { size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); if (vbits == 0) { vbits = _mi_os_virtual_address_bits(); + #if MI_ARCH_X64 if (vbits >= 48) { vbits = 47; } + #endif } - // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) - // 64 KiB for 4 GiB address space (on 32-bit) + mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); From 56cbddfc7e39ec0a4ea7585641bf333495b83604 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 23:08:52 -0800 Subject: [PATCH 124/264] initial work on a two-level page-map --- include/mimalloc/bits.h | 8 ++ include/mimalloc/internal.h | 64 +++++++++++++--- src/page-map.c | 143 +++++++++++++++++++++++++++++++++++- test/test-stress.c | 4 +- 4 files changed, 206 insertions(+), 13 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 32b9d528..fb6c2e8c 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -68,6 +68,14 @@ typedef int32_t mi_ssize_t; #define MI_MiB (MI_KiB*MI_KiB) #define MI_GiB (MI_MiB*MI_KiB) +#if MI_INTPTR_SIZE > 4 +#define MI_MAX_VABITS (48) +#define MI_PAGE_MAP_FLAT 0 +#else +#define MI_MAX_VABITS (32) +#define MI_PAGE_MAP_FLAT 1 +#endif + /* -------------------------------------------------------------------------------- Architecture diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 208989e3..dbc45133 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -422,6 +422,14 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) { return (heap != &_mi_heap_empty); } +static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) { + mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); + const size_t idx = _mi_wsize_from_size(size); + mi_assert_internal(idx < MI_PAGES_DIRECT); + return heap->pages_free_direct[idx]; +} + + //static inline uintptr_t _mi_ptr_cookie(const void* p) { // extern mi_heap_t _mi_heap_main; // mi_assert_internal(_mi_heap_main.cookie != 0); @@ -433,14 +441,9 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) { Pages ----------------------------------------------------------- */ -static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) { - mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); - const size_t idx = _mi_wsize_from_size(size); - mi_assert_internal(idx < MI_PAGES_DIRECT); - return heap->pages_free_direct[idx]; -} - +#if MI_PAGE_MAP_FLAT +// flat page-map committed on demand extern uint8_t* _mi_page_map; static inline uintptr_t _mi_page_map_index(const void* p) { @@ -465,16 +468,59 @@ static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) { static inline mi_page_t* _mi_checked_ptr_page(const void* p) { bool valid; - mi_page_t* const page = _mi_ptr_page_ex(p,&valid); + mi_page_t* const page = _mi_ptr_page_ex(p, &valid); return (valid ? 
page : NULL); } +static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { + return _mi_ptr_page_ex(p, NULL); +} + +#else + +// 2-level page map + +// one page-map directory = 64 KiB => covers 2^16 * 2^16 = 2^32 = 4 GiB address space +// the page-map needs 48-16-16 = 16 bits => 2^16 map directories = 2^16 * 2^3 = 2^19 = 512 KiB size. +// we commit the page-map directories on-demand. (2^16 * 2^16 = 2^32 ~= 4 GiB needed to cover 256 TeB) + +#define MI_PAGE_MAP_SUB_SHIFT (16) // 64 KiB +#define MI_PAGE_MAP_SUB_SIZE (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) +#define MI_PAGE_MAP_SHIFT (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT) +#define MI_PAGE_MAP_COUNT (MI_ZU(1) << MI_PAGE_MAP_SHIFT) + +extern uint8_t** _mi_page_map; + +static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) { + const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; + if (sub_idx != NULL) { *sub_idx = (uint32_t)u % MI_PAGE_MAP_SUB_SIZE; } + return (size_t)(u / MI_PAGE_MAP_COUNT); +} + +static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { + const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; + const uint8_t* const sub = _mi_page_map[u / MI_PAGE_MAP_COUNT]; + const uint8_t ofs = sub[(uint32_t)u % MI_PAGE_MAP_SUB_SIZE]; + return (mi_page_t*)((u - ofs + 1) * MI_ARENA_SLICE_SIZE); +} + +static inline mi_page_t* _mi_checked_ptr_page(const void* p) { + const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; + const uint8_t* const sub = _mi_page_map[u / MI_PAGE_MAP_COUNT]; + //if mi_unlikely(sub == NULL) { return NULL; } + const uint8_t ofs = sub[(uint32_t)u % MI_PAGE_MAP_SUB_SIZE]; + //if mi_unlikely(ofs == 0) { return NULL; } + return (mi_page_t*)((u - ofs + 1) * MI_ARENA_SLICE_SIZE); +} + +#endif + static inline mi_page_t* _mi_ptr_page(const void* p) { mi_assert_internal(p==NULL || mi_is_in_heap_region(p)); #if MI_DEBUG || defined(__APPLE__) return _mi_checked_ptr_page(p); #else - return _mi_ptr_page_ex(p,NULL); + return _mi_unchecked_ptr_page(p); #endif } diff --git a/src/page-map.c b/src/page-map.c index d6517f72..a814610f 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,6 +9,8 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" +#if MI_PAGE_MAP_FLAT + // The page-map contains a byte for each 64kb slice in the address space. // For an address `a` where `n = _mi_page_map[a >> 16]`: // 0 = unused @@ -17,6 +19,9 @@ terms of the MIT license. A copy of the license can be found in the file // // 1 byte per slice => 1 GiB page map = 2^30 slices of 2^16 = 2^46 = 64 TiB address space. // 4 GiB virtual for 256 TiB address space (48 bit) (and 64 KiB for 4 GiB address space (on 32-bit)). 
+ +// 1MiB = 2^20*2^16 = 2^36 = 64GiB address space +// 2^12 pointers = 2^15 k = 32k mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; @@ -25,7 +30,7 @@ static mi_memid_t mi_page_map_memid; // (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) -static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), +sstatic mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; bool _mi_page_map_init(void) { @@ -101,7 +106,7 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* void _mi_page_map_register(mi_page_t* page) { mi_assert_internal(page != NULL); - mi_assert_internal(_mi_is_aligned(page,MI_PAGE_ALIGN)); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! if mi_unlikely(_mi_page_map == NULL) { if (!_mi_page_map_init()) return; @@ -151,3 +156,137 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att return false; } } + +#else + +mi_decl_cache_align uint8_t** _mi_page_map = NULL; + +static void* mi_page_map_max_address = NULL; +static mi_memid_t mi_page_map_memid; + +bool _mi_page_map_init(void) { + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + if (vbits == 0) { + vbits = _mi_os_virtual_address_bits(); + mi_assert_internal(vbits <= MI_MAX_VABITS); + } + + mi_page_map_max_address = (void*)(MI_PU(1) << vbits); + const size_t os_page_size = _mi_os_page_size(); + const size_t page_map_size = _mi_align_up(MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT + MI_INTPTR_SHIFT), os_page_size); + const size_t reserve_size = page_map_size + (2 * MI_PAGE_MAP_SUB_SIZE); + _mi_page_map = (uint8_t**)_mi_os_alloc_aligned(reserve_size, 1, true /* commit */, true, &mi_page_map_memid); + if (_mi_page_map==NULL) { + _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", reserve_size / MI_KiB); + return false; + } + if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { + _mi_warning_message("the page map was committed but not zero initialized!\n"); + _mi_memzero_aligned(_mi_page_map, reserve_size); + } + + uint8_t* sub0 = (uint8_t*)_mi_page_map + page_map_size; + uint8_t* sub1 = sub0 + MI_PAGE_MAP_SUB_SIZE; + // initialize the first part so NULL pointers get resolved without an access violation + _mi_page_map[0] = sub0; + sub0[0] = 1; // so _mi_ptr_page(NULL) == NULL + // and initialize the 4GiB range where we were allocated + _mi_page_map[_mi_page_map_index(_mi_page_map,NULL)] = sub1; + + mi_assert_internal(_mi_ptr_page(NULL)==NULL); + return true; +} + +static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* sub_idx, size_t* slice_count) { + size_t page_size; + *page_start = mi_page_area(page, &page_size); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer + *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks + return _mi_page_map_index(page,sub_idx); +} + + +static inline void 
mi_page_map_set_range(size_t idx, size_t sub_idx, size_t slice_count, uint8_t (*set)(uint8_t ofs)) { + // is the page map area that contains the page address committed? + uint8_t ofs = 1; + while (slice_count > 0) { + uint8_t* sub = _mi_page_map[idx]; + if (sub == NULL) { + mi_memid_t memid; + sub = (uint8_t*)_mi_os_alloc(MI_PAGE_MAP_SUB_SIZE, &memid); + if (sub == NULL) { + _mi_error_message(EFAULT, "internal error: unable to extend the page map\n"); + return; // abort? + } + } + // set the offsets for the page + while (sub_idx < MI_PAGE_MAP_SUB_SIZE && slice_count > 0) { + sub[sub_idx] = set(ofs); + sub_idx++; + ofs++; + slice_count--; + } + sub_idx = 0; // potentially wrap around to the next idx + } +} + +static uint8_t set_ofs(uint8_t ofs) { + return ofs; +} + +void _mi_page_map_register(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! + if mi_unlikely(_mi_page_map == NULL) { + if (!_mi_page_map_init()) return; + } + mi_assert(_mi_page_map!=NULL); + uint8_t* page_start; + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &page_start, &sub_idx, &slice_count); + mi_page_map_set_range(idx, sub_idx, slice_count, &set_ofs); +} + +static uint8_t set_zero(uint8_t ofs) { + MI_UNUSED(ofs); + return 0; +} + + +void _mi_page_map_unregister(mi_page_t* page) { + mi_assert_internal(_mi_page_map != NULL); + // get index and count + uint8_t* page_start; + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &page_start, &sub_idx, &slice_count); + // unset the offsets + mi_page_map_set_range(idx, sub_idx, slice_count, &set_zero); +} + +void _mi_page_map_unregister_range(void* start, size_t size) { + const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); + size_t sub_idx; + const size_t idx = _mi_page_map_index(start, &sub_idx); + mi_page_map_set_range(idx, sub_idx, slice_count, &set_zero); +} + +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + + if mi_unlikely(p >= mi_page_map_max_address) return false; + size_t sub_idx; + const size_t idx = _mi_page_map_index(p, &sub_idx); + uint8_t* sub = _mi_page_map[idx]; + if (sub != NULL) { + return (sub[sub_idx] != 0); + } + else { + return false; + } +} + + +#endif + diff --git a/test/test-stress.c b/test/test-stress.c index 0920a02e..bbcded65 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -302,8 +302,8 @@ int main(int argc, char** argv) { mi_option_enable(mi_option_visit_abandoned); #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) - // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); - mi_option_set(mi_option_purge_delay,10); + mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + //mi_option_set(mi_option_purge_delay,10); #endif #ifndef USE_STD_MALLOC mi_stats_reset(); From c9b2d31665b9102114569ccf78be1328c2843fe7 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 23:17:11 -0800 Subject: [PATCH 125/264] fix page_map initialization --- src/page-map.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/page-map.c b/src/page-map.c index a814610f..403be079 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -214,6 +214,12 @@ static inline void mi_page_map_set_range(size_t idx, size_t sub_idx, size_t slic if (sub == NULL) { mi_memid_t memid; sub = (uint8_t*)_mi_os_alloc(MI_PAGE_MAP_SUB_SIZE, &memid); + 
uint8_t* expect = NULL; + if (!mi_atomic_cas_strong_acq_rel(((_Atomic(uint8_t*)*)&_mi_page_map[idx]), &expect, sub)) { + _mi_os_free(sub, MI_PAGE_MAP_SUB_SIZE, memid); + sub = expect; + mi_assert_internal(sub!=NULL); + } if (sub == NULL) { _mi_error_message(EFAULT, "internal error: unable to extend the page map\n"); return; // abort? From 93fa8d895ad7366285782cf1f1259fe427c4d631 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 12:18:53 -0800 Subject: [PATCH 126/264] revert back to flat address map --- include/mimalloc/bits.h | 8 -- include/mimalloc/internal.h | 65 ++-------- src/free.c | 8 +- src/page-map.c | 248 ++++++++---------------------------- 4 files changed, 66 insertions(+), 263 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index fb6c2e8c..32b9d528 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -68,14 +68,6 @@ typedef int32_t mi_ssize_t; #define MI_MiB (MI_KiB*MI_KiB) #define MI_GiB (MI_MiB*MI_KiB) -#if MI_INTPTR_SIZE > 4 -#define MI_MAX_VABITS (48) -#define MI_PAGE_MAP_FLAT 0 -#else -#define MI_MAX_VABITS (32) -#define MI_PAGE_MAP_FLAT 1 -#endif - /* -------------------------------------------------------------------------------- Architecture diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index dbc45133..17c02941 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -169,6 +169,7 @@ bool _mi_page_map_init(void); void _mi_page_map_register(mi_page_t* page); void _mi_page_map_unregister(mi_page_t* page); void _mi_page_map_unregister_range(void* start, size_t size); +mi_page_t* _mi_safe_ptr_page(const void* p); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -441,29 +442,18 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si Pages ----------------------------------------------------------- */ -#if MI_PAGE_MAP_FLAT - // flat page-map committed on demand extern uint8_t* _mi_page_map; -static inline uintptr_t _mi_page_map_index(const void* p) { - return (((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT); +static inline size_t _mi_page_map_index(const void* p) { + return (size_t)((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); } static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) { - #if 1 - const uintptr_t idx = _mi_page_map_index(p); + const size_t idx = _mi_page_map_index(p); const size_t ofs = _mi_page_map[idx]; - if (valid != NULL) *valid = (ofs != 0); - return (mi_page_t*)((idx - ofs + 1) << MI_ARENA_SLICE_SHIFT); - #else - const uintptr_t idx = _mi_page_map_index(p); - const uintptr_t up = idx << MI_ARENA_SLICE_SHIFT; - __builtin_prefetch((void*)up); - const size_t ofs = _mi_page_map[idx]; - if (valid != NULL) *valid = (ofs != 0); - return (mi_page_t*)(up - ((ofs - 1) << MI_ARENA_SLICE_SHIFT)); - #endif + if (valid != NULL) { *valid = (ofs != 0); } + return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) + 1 - ofs) << MI_ARENA_SLICE_SHIFT); } static inline mi_page_t* _mi_checked_ptr_page(const void* p) { @@ -476,49 +466,10 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { return _mi_ptr_page_ex(p, NULL); } -#else - -// 2-level page map - -// one page-map directory = 64 KiB => covers 2^16 * 2^16 = 2^32 = 4 GiB address space -// the page-map needs 48-16-16 = 16 bits => 2^16 map directories = 2^16 * 2^3 = 2^19 = 512 KiB size. -// we commit the page-map directories on-demand. 
(2^16 * 2^16 = 2^32 ~= 4 GiB needed to cover 256 TeB) - -#define MI_PAGE_MAP_SUB_SHIFT (16) // 64 KiB -#define MI_PAGE_MAP_SUB_SIZE (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) -#define MI_PAGE_MAP_SHIFT (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT) -#define MI_PAGE_MAP_COUNT (MI_ZU(1) << MI_PAGE_MAP_SHIFT) - -extern uint8_t** _mi_page_map; - -static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) { - const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; - if (sub_idx != NULL) { *sub_idx = (uint32_t)u % MI_PAGE_MAP_SUB_SIZE; } - return (size_t)(u / MI_PAGE_MAP_COUNT); -} - -static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { - const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; - const uint8_t* const sub = _mi_page_map[u / MI_PAGE_MAP_COUNT]; - const uint8_t ofs = sub[(uint32_t)u % MI_PAGE_MAP_SUB_SIZE]; - return (mi_page_t*)((u - ofs + 1) * MI_ARENA_SLICE_SIZE); -} - -static inline mi_page_t* _mi_checked_ptr_page(const void* p) { - const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; - const uint8_t* const sub = _mi_page_map[u / MI_PAGE_MAP_COUNT]; - //if mi_unlikely(sub == NULL) { return NULL; } - const uint8_t ofs = sub[(uint32_t)u % MI_PAGE_MAP_SUB_SIZE]; - //if mi_unlikely(ofs == 0) { return NULL; } - return (mi_page_t*)((u - ofs + 1) * MI_ARENA_SLICE_SIZE); -} - -#endif - static inline mi_page_t* _mi_ptr_page(const void* p) { mi_assert_internal(p==NULL || mi_is_in_heap_region(p)); #if MI_DEBUG || defined(__APPLE__) - return _mi_checked_ptr_page(p); + return _mi_checked_ptr_page(p); #else return _mi_unchecked_ptr_page(p); #endif @@ -637,7 +588,7 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) { return (page->free != NULL); } - + // is the page not yet used up to its reserved space? static inline bool mi_page_is_expandable(const mi_page_t* page) { mi_assert_internal(page != NULL); diff --git a/src/free.c b/src/free.c index 88f784c7..d08123a2 100644 --- a/src/free.c +++ b/src/free.c @@ -145,14 +145,14 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); return NULL; } - #endif - mi_page_t* const page = _mi_ptr_page(p); - #if MI_DEBUG + mi_page_t* const page = _mi_safe_ptr_page(p); if (page == NULL && p != NULL) { _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); } - #endif return page; + #else + return _mi_ptr_page(p); + #endif } // Free a block diff --git a/src/page-map.c b/src/page-map.c index 403be079..a4001359 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,60 +9,61 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" -#if MI_PAGE_MAP_FLAT -// The page-map contains a byte for each 64kb slice in the address space. -// For an address `a` where `n = _mi_page_map[a >> 16]`: +// The page-map contains a byte for each 64kb slice in the address space. +// For an address `a` where `ofs = _mi_page_map[a >> 16]`: // 0 = unused // 1 = the slice at `a & ~0xFFFF` is a mimalloc page. -// 1 < n << 127 = the slice is part of a page, starting at `(((a>>16) - n - 1) << 16)`. -// -// 1 byte per slice => 1 GiB page map = 2^30 slices of 2^16 = 2^46 = 64 TiB address space. -// 4 GiB virtual for 256 TiB address space (48 bit) (and 64 KiB for 4 GiB address space (on 32-bit)). +// 1 < ofs <= 127 = the slice is part of a page, starting at `(((a>>16) - ofs - 1) << 16)`. 
+// +// 1 byte per slice => 1 TiB address space needs a 2^14 * 2^16 = 16 MiB page map. +// A full 256 TiB address space (48 bit) needs a 4 GiB page map. +// A full 4 GiB address space (32 bit) needs only a 64 KiB page map. -// 1MiB = 2^20*2^16 = 2^36 = 64GiB address space -// 2^12 pointers = 2^15 k = 32k mi_decl_cache_align uint8_t* _mi_page_map = NULL; -static bool mi_page_map_all_committed = false; -static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; -static void* mi_page_map_max_address = NULL; -static mi_memid_t mi_page_map_memid; +static void* mi_page_map_max_address = NULL; +static mi_memid_t mi_page_map_memid; +#define MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT MI_ARENA_SLICE_SIZE +static mi_bitmap_t* mi_page_map_commit; // one bit per committed 64 KiB entries -// (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) -sstatic mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), - { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; +static void mi_page_map_ensure_committed(size_t idx, size_t slice_count); bool _mi_page_map_init(void) { - size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); if (vbits == 0) { vbits = _mi_os_virtual_address_bits(); - #if MI_ARCH_X64 + #if MI_ARCH_X64 // canonical address is limited to the first 128 TiB if (vbits >= 48) { vbits = 47; } #endif } - + + // Allocate the page map and commit bits mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); - - mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_DEFAULT_BIT_COUNT); - // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); - - mi_page_map_all_committed = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? - _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid); - if (_mi_page_map==NULL) { + const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); + const size_t bitmap_size = (commit ? 
0 : mi_bitmap_size(commit_bits, NULL)); + const size_t reserve_size = bitmap_size + page_map_size; + uint8_t* const base = (uint8_t*)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); + if (base==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); return false; } if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { - _mi_warning_message("the page map was committed but not zero initialized!\n"); - _mi_memzero_aligned(_mi_page_map, page_map_size); + _mi_warning_message("internal: the page map was committed but not zero initialized!\n"); + _mi_memzero_aligned(base, reserve_size); } + if (bitmap_size > 0) { + mi_page_map_commit = (mi_bitmap_t*)base; + _mi_os_commit(mi_page_map_commit, bitmap_size, NULL); + mi_bitmap_init(mi_page_map_commit, commit_bits, true); + } + _mi_page_map = base + bitmap_size; + // commit the first part so NULL pointers get resolved without an access violation - if (!mi_page_map_all_committed) { - bool is_zero; - _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero); - if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(_mi_page_map, _mi_os_page_size()); } + if (!commit) { + mi_page_map_ensure_committed(0, 1); } _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL mi_assert_internal(_mi_ptr_page(NULL)==NULL); @@ -70,30 +71,31 @@ bool _mi_page_map_init(void) { } static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { - // is the page map area that contains the page address committed? + // is the page map area that contains the page address committed? // we always set the commit bits so we can track what ranges are in-use. // we only actually commit if the map wasn't committed fully already. 
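The commit-on-demand scheme maps a range of page-map entries to a range of commit bits, one bit per MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT entries, and commits each block whose bit is still clear; if two threads race, a block may be committed twice, which is harmless. A condensed sketch of just the index arithmetic follows, with illustrative names; the real code additionally tests and sets bits in the commit bitmap and calls _mi_os_commit.

  #include <stddef.h>

  #define ENTRIES_PER_COMMIT_BIT (64*1024)   // stand-in for MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT

  // call commit_block(i) for every commit bit covering page-map entries [idx, idx+count);
  // a range can straddle several commit blocks
  static void ensure_committed(size_t idx, size_t count, void (*commit_block)(size_t bit)) {
    const size_t lo = idx / ENTRIES_PER_COMMIT_BIT;
    const size_t hi = (idx + count - 1) / ENTRIES_PER_COMMIT_BIT;
    for (size_t i = lo; i <= hi; i++) {
      commit_block(i);
    }
  }
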
- const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; - const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; - for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_clearN(&mi_page_map_commit, i, 1)) { - // this may race, in which case we do multiple commits (which is ok) - if (!mi_page_map_all_committed) { + if (mi_page_map_commit != NULL) { + const size_t commit_idx = idx / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT; + const size_t commit_idx_hi = (idx + slice_count - 1) / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT; + for (size_t i = commit_idx; i <= commit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_clear(mi_page_map_commit, i)) { + // this may race, in which case we do multiple commits (which is ok) bool is_zero; - uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); - const size_t size = mi_page_map_entries_per_commit_bit; + uint8_t* const start = _mi_page_map + (i * MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); + const size_t size = MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT; _mi_os_commit(start, size, &is_zero); - if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); } + if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); } + mi_bitmap_set(mi_page_map_commit, i); } - mi_bitmap_set(&mi_page_map_commit, i); } } #if MI_DEBUG > 0 _mi_page_map[idx] = 0; _mi_page_map[idx+slice_count-1] = 0; - #endif + #endif } + static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) { size_t page_size; *page_start = mi_page_area(page, &page_size); @@ -102,8 +104,6 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* return _mi_page_map_index(page); } - - void _mi_page_map_register(mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); @@ -125,7 +125,6 @@ void _mi_page_map_register(mi_page_t* page) { } } - void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); // get index and count @@ -143,156 +142,17 @@ void _mi_page_map_unregister_range(void* start, size_t size) { _mi_memzero(&_mi_page_map[index], slice_count); } -mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - // if mi_unlikely(_mi_page_map==NULL) { // happens on macOS during loading - // _mi_page_map_init(); - // } - if mi_unlikely(p >= mi_page_map_max_address) return false; - uintptr_t idx = ((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); - if (mi_page_map_all_committed || mi_bitmap_is_setN(&mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { - return (_mi_page_map[idx] != 0); - } - else { - return false; - } -} -#else - -mi_decl_cache_align uint8_t** _mi_page_map = NULL; - -static void* mi_page_map_max_address = NULL; -static mi_memid_t mi_page_map_memid; - -bool _mi_page_map_init(void) { - size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); - if (vbits == 0) { - vbits = _mi_os_virtual_address_bits(); - mi_assert_internal(vbits <= MI_MAX_VABITS); - } - - mi_page_map_max_address = (void*)(MI_PU(1) << vbits); - const size_t os_page_size = _mi_os_page_size(); - const size_t page_map_size = _mi_align_up(MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT + MI_INTPTR_SHIFT), os_page_size); - const size_t reserve_size = page_map_size + (2 * MI_PAGE_MAP_SUB_SIZE); - _mi_page_map = 
(uint8_t**)_mi_os_alloc_aligned(reserve_size, 1, true /* commit */, true, &mi_page_map_memid); - if (_mi_page_map==NULL) { - _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", reserve_size / MI_KiB); - return false; - } - if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { - _mi_warning_message("the page map was committed but not zero initialized!\n"); - _mi_memzero_aligned(_mi_page_map, reserve_size); - } - - uint8_t* sub0 = (uint8_t*)_mi_page_map + page_map_size; - uint8_t* sub1 = sub0 + MI_PAGE_MAP_SUB_SIZE; - // initialize the first part so NULL pointers get resolved without an access violation - _mi_page_map[0] = sub0; - sub0[0] = 1; // so _mi_ptr_page(NULL) == NULL - // and initialize the 4GiB range where we were allocated - _mi_page_map[_mi_page_map_index(_mi_page_map,NULL)] = sub1; - - mi_assert_internal(_mi_ptr_page(NULL)==NULL); - return true; -} - -static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* sub_idx, size_t* slice_count) { - size_t page_size; - *page_start = mi_page_area(page, &page_size); - if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer - *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks - return _mi_page_map_index(page,sub_idx); -} - - -static inline void mi_page_map_set_range(size_t idx, size_t sub_idx, size_t slice_count, uint8_t (*set)(uint8_t ofs)) { - // is the page map area that contains the page address committed? - uint8_t ofs = 1; - while (slice_count > 0) { - uint8_t* sub = _mi_page_map[idx]; - if (sub == NULL) { - mi_memid_t memid; - sub = (uint8_t*)_mi_os_alloc(MI_PAGE_MAP_SUB_SIZE, &memid); - uint8_t* expect = NULL; - if (!mi_atomic_cas_strong_acq_rel(((_Atomic(uint8_t*)*)&_mi_page_map[idx]), &expect, sub)) { - _mi_os_free(sub, MI_PAGE_MAP_SUB_SIZE, memid); - sub = expect; - mi_assert_internal(sub!=NULL); - } - if (sub == NULL) { - _mi_error_message(EFAULT, "internal error: unable to extend the page map\n"); - return; // abort? - } - } - // set the offsets for the page - while (sub_idx < MI_PAGE_MAP_SUB_SIZE && slice_count > 0) { - sub[sub_idx] = set(ofs); - sub_idx++; - ofs++; - slice_count--; - } - sub_idx = 0; // potentially wrap around to the next idx - } -} - -static uint8_t set_ofs(uint8_t ofs) { - return ofs; -} - -void _mi_page_map_register(mi_page_t* page) { - mi_assert_internal(page != NULL); - mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! 
- if mi_unlikely(_mi_page_map == NULL) { - if (!_mi_page_map_init()) return; - } - mi_assert(_mi_page_map!=NULL); - uint8_t* page_start; - size_t slice_count; - size_t sub_idx; - const size_t idx = mi_page_map_get_idx(page, &page_start, &sub_idx, &slice_count); - mi_page_map_set_range(idx, sub_idx, slice_count, &set_ofs); -} - -static uint8_t set_zero(uint8_t ofs) { - MI_UNUSED(ofs); - return 0; -} - - -void _mi_page_map_unregister(mi_page_t* page) { - mi_assert_internal(_mi_page_map != NULL); - // get index and count - uint8_t* page_start; - size_t slice_count; - size_t sub_idx; - const size_t idx = mi_page_map_get_idx(page, &page_start, &sub_idx, &slice_count); - // unset the offsets - mi_page_map_set_range(idx, sub_idx, slice_count, &set_zero); -} - -void _mi_page_map_unregister_range(void* start, size_t size) { - const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); - size_t sub_idx; - const size_t idx = _mi_page_map_index(start, &sub_idx); - mi_page_map_set_range(idx, sub_idx, slice_count, &set_zero); +mi_page_t* _mi_safe_ptr_page(const void* p) { + if mi_unlikely(p >= mi_page_map_max_address) return NULL; + const uintptr_t idx = _mi_page_map_index(p); + if mi_unlikely(mi_page_map_commit == NULL || !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL; + const uintptr_t ofs = _mi_page_map[idx]; + if mi_unlikely(ofs == 0) return NULL; + return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) - ofs + 1) << MI_ARENA_SLICE_SHIFT); } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - - if mi_unlikely(p >= mi_page_map_max_address) return false; - size_t sub_idx; - const size_t idx = _mi_page_map_index(p, &sub_idx); - uint8_t* sub = _mi_page_map[idx]; - if (sub != NULL) { - return (sub[sub_idx] != 0); - } - else { - return false; - } + return (_mi_safe_ptr_page(p) != NULL); } - -#endif - From 8d16303aa6a6d25975f01569b71b7127a0a8d559 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 12:21:31 -0800 Subject: [PATCH 127/264] add -mtune=native with opt arch --- CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ebd02b20..07a292e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ endif() if (CMAKE_GENERATOR MATCHES "^Visual Studio.*$") message(STATUS "Note: when building with Visual Studio the build type is specified when building.") - message(STATUS "For example: 'cmake --build . --config=Release") + message(STATUS "For example: 'cmake --build . 
--config=Release") endif() if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$") @@ -401,9 +401,9 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel") endif() if(MI_OPT_ARCH) if(MI_ARCH STREQUAL "x64") - set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2") # fast bit scan (since 2013) + set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2;-mtune=native") # fast bit scan (since 2013) elseif(MI_ARCH STREQUAL "arm64") - set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics (since 2016) + set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a;-mtune=native") # fast atomics (since 2016) endif() endif() endif() @@ -557,7 +557,7 @@ if(MI_BUILD_SHARED) elseif(MI_ARCH STREQUAL "x64") set(MIMALLOC_REDIRECT_SUFFIX "") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") - message(STATUS "Note: x64 code emulated on Windows for arm64 should use an arm64ec build of 'mimalloc-override.dll'") + message(STATUS "Note: x64 code emulated on Windows for arm64 should use an arm64ec build of 'mimalloc-override.dll'") message(STATUS " with 'mimalloc-redirect-arm64ec.dll'. See the 'bin\\readme.md' for more information.") endif() elseif(MI_ARCH STREQUAL "x86") @@ -681,7 +681,7 @@ endif() # ----------------------------------------------------------------------------- if (MI_OVERRIDE) if (MI_BUILD_SHARED) - target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE) + target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE) endif() if(NOT WIN32) # It is only possible to override malloc on Windows when building as a DLL. From 3c7d7e1f11eeca0dec9d48119ed22f40e63ae518 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 14:07:57 -0800 Subject: [PATCH 128/264] experiment with 2 level pagemap --- include/mimalloc/bits.h | 18 ++++ include/mimalloc/internal.h | 43 +++++++++- src/page-map.c | 162 ++++++++++++++++++++++++++++++++++++ 3 files changed, 222 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 32b9d528..ca0b5905 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -107,6 +107,24 @@ typedef int32_t mi_ssize_t; // Define big endian if needed // #define MI_BIG_ENDIAN 1 +#if MI_DEFAULT_VIRTUAL_ADDRESS_BITS > 0 +#define MI_MAX_VABITS MI_DEFAULT_VIRTUAL_ADDRESS_BITS +#elif MI_ARCH_X64 +#define MI_MAX_VABITS (47) +#elif MI_INTPTR_SIZE > 4 +#define MI_MAX_VABITS (48) +#else +#define MI_MAX_VABITS (32) +#endif + +#ifndef MI_PAGE_MAP_FLAT +#if MI_MAX_VABITS <= 40 +#define MI_PAGE_MAP_FLAT 1 +#else +#define MI_PAGE_MAP_FLAT 0 +#endif +#endif + /* -------------------------------------------------------------------------------- Builtin's diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 17c02941..8955db5e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -442,6 +442,8 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si Pages ----------------------------------------------------------- */ +#if MI_PAGE_MAP_FLAT + // flat page-map committed on demand extern uint8_t* _mi_page_map; @@ -466,10 +468,49 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { return _mi_ptr_page_ex(p, NULL); } +#else + +// 2-level page map + +// one sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space +// the page-map needs 48-16-13 = 19 bits => 2^19 sub map pointers = 4 MiB size. +// we commit the page-map and the sub maps on-demand. 
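Concretely, with 48-bit virtual addresses and 64 KiB slices: the low 16 bits of an address locate a byte inside a slice, the next 13 bits select one of the 2^13 pointers in a 64 KiB sub-map (which therefore covers 2^13 * 2^16 = 2^29 bytes = 512 MiB of address space), and the remaining 19 bits select the sub-map itself, giving 2^19 root pointers = 4 MiB. A standalone sketch of this split and the resulting lookup, mirroring _mi_page_map_index and _mi_unchecked_ptr_page with hypothetical names:

  #include <stdint.h>
  #include <stddef.h>

  #define SLICE_SHIFT 16                         // 64 KiB slices (MI_ARENA_SLICE_SHIFT)
  #define SUB_SHIFT   13                         // entries per sub-map (MI_PAGE_MAP_SUB_SHIFT)
  #define SUB_COUNT   ((size_t)1 << SUB_SHIFT)

  typedef struct page_s page_t;                  // stand-in for mi_page_t
  extern page_t*** page_map;                     // root table; stand-in for _mi_page_map

  // split an address into (root index, sub index)
  static size_t page_map_index(const void* p, size_t* sub_idx) {
    const uintptr_t u = (uintptr_t)p >> SLICE_SHIFT;   // slice index
    *sub_idx = (size_t)(u % SUB_COUNT);                // low 13 bits of the slice index
    return (size_t)(u / SUB_COUNT);                    // remaining bits select the sub-map
  }

  // unchecked lookup: assumes the sub-map covering p has been allocated
  static page_t* ptr_page(const void* p) {
    size_t sub_idx;
    const size_t idx = page_map_index(p, &sub_idx);
    return page_map[idx][sub_idx];
  }
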
+ +#define MI_PAGE_MAP_SUB_SHIFT (13) +#define MI_PAGE_MAP_SUB_COUNT (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) + +#define MI_PAGE_MAP_SHIFT (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT) +#define MI_PAGE_MAP_COUNT (MI_ZU(1) << MI_PAGE_MAP_SHIFT) + +extern mi_page_t*** _mi_page_map; + +static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) { + const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; + if (sub_idx != NULL) { *sub_idx = (uint32_t)u % MI_PAGE_MAP_SUB_COUNT; } + return (size_t)(u / MI_PAGE_MAP_SUB_COUNT); +} + +static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { + size_t sub_idx; + const size_t idx = _mi_page_map_index(p, &sub_idx); + return _mi_page_map[idx][sub_idx]; +} + +static inline mi_page_t* _mi_checked_ptr_page(const void* p) { + size_t sub_idx; + const size_t idx = _mi_page_map_index(p, &sub_idx); + mi_page_t** const sub = _mi_page_map[idx]; + if mi_unlikely(sub == NULL) return NULL; + return sub[sub_idx]; +} + +#endif + + static inline mi_page_t* _mi_ptr_page(const void* p) { mi_assert_internal(p==NULL || mi_is_in_heap_region(p)); #if MI_DEBUG || defined(__APPLE__) - return _mi_checked_ptr_page(p); + return _mi_checked_ptr_page(p); #else return _mi_unchecked_ptr_page(p); #endif diff --git a/src/page-map.c b/src/page-map.c index a4001359..99a9b60a 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,6 +9,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" +#if MI_PAGE_MAP_FLAT // The page-map contains a byte for each 64kb slice in the address space. // For an address `a` where `ofs = _mi_page_map[a >> 16]`: @@ -156,3 +157,164 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att return (_mi_safe_ptr_page(p) != NULL); } +#else + +mi_decl_cache_align mi_page_t*** _mi_page_map; +static void* mi_page_map_max_address; +static mi_memid_t mi_page_map_memid; + +static _Atomic(mi_bfield_t) mi_page_map_commit; // one bit per committed 64 KiB entries + +static mi_page_t** mi_page_map_ensure_at(size_t idx); +static inline void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count); + +bool _mi_page_map_init(void) { + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + if (vbits == 0) { + vbits = _mi_os_virtual_address_bits(); + #if MI_ARCH_X64 // canonical address is limited to the first 128 TiB + if (vbits >= 48) { vbits = 47; } + #endif + } + + // Allocate the page map and commit bits + mi_page_map_max_address = (void*)(MI_PU(1) << vbits); + const size_t page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT)); + const size_t os_page_size = _mi_os_page_size(); + const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size); + const size_t reserve_size = page_map_size + os_page_size; + const bool commit = page_map_size <= 64*MI_KiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap); // _mi_os_has_overcommit(); // commit on-access on Linux systems? 
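+  // reserve the root map plus one extra OS page; that extra page is used further below as the
+  // (single committed page of the) sub-map backing the NULL address range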
+ _mi_page_map = (mi_page_t***)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); + if (_mi_page_map==NULL) { + _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); + return false; + } + if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { + _mi_warning_message("internal: the page map was committed but not zero initialized!\n"); + _mi_memzero_aligned(_mi_page_map, page_map_size); + } + mi_atomic_store_release(&mi_page_map_commit, (commit ? ~0 : (mi_bfield_t)0)); + + // commit the first part so NULL pointers get resolved without an access violation + mi_page_map_ensure_at(0); + + // note: for the NULL range we only commit one OS page + // mi_page_map_set_range(NULL, 0, 0, 1); + _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); + if (!mi_page_map_memid.initially_committed) { + _mi_os_commit(_mi_page_map[0], os_page_size, NULL); + } + _mi_page_map[0][0] = NULL; + + mi_assert_internal(_mi_ptr_page(NULL)==NULL); + return true; +} + +static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) { + mi_bfield_t commit = mi_atomic_load_relaxed(&mi_page_map_commit); + const size_t bit_idx = (idx*MI_INTPTR_SIZE)/MI_ARENA_SLICE_SIZE; // we commit a slice of entries at a time + mi_assert_internal(bit_idx < MI_BFIELD_BITS); + if (pbit_idx != NULL) { *pbit_idx = bit_idx; } + return ((commit & (MI_ZU(1) << bit_idx)) != 0); +} + +static mi_page_t** mi_page_map_ensure_committed(size_t idx) { + size_t bit_idx; + if mi_unlikely(!mi_page_map_is_committed(idx, &bit_idx)) { + uint8_t* start = (uint8_t*)_mi_page_map + (bit_idx * MI_ARENA_SLICE_SIZE); + _mi_os_commit(start, MI_ARENA_SLICE_SIZE, NULL); + mi_atomic_or_acq_rel(&mi_page_map_commit, MI_ZU(1) << bit_idx); + } + return _mi_page_map[idx]; +} + +static mi_page_t** mi_page_map_ensure_at(size_t idx) { + mi_page_t** sub = mi_page_map_ensure_committed(idx); + if mi_unlikely(sub == NULL) { + // sub map not yet allocated, alloc now + mi_memid_t memid; + sub = (mi_page_t**)_mi_os_alloc(MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), &memid); + mi_page_t** expect = NULL; + if (!mi_atomic_cas_strong_acq_rel(((_Atomic(mi_page_t**)*)&_mi_page_map[idx]), &expect, sub)) { + // another thread already allocated it.. free and continue + _mi_os_free(sub, MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), memid); + sub = expect; + mi_assert_internal(sub!=NULL); + } + if (sub == NULL) { + _mi_error_message(EFAULT, "internal error: unable to extend the page map\n"); + } + } + return sub; +} + +static void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count) { + // is the page map area that contains the page address committed? 
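+  // walk the range one sub-map at a time: ensure each sub-map is committed and allocated,
+  // fill its entries with `page`, then continue at the next root index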
+ while (slice_count > 0) { + mi_page_t** sub = mi_page_map_ensure_at(idx); + // set the offsets for the page + while (sub_idx < MI_PAGE_MAP_SUB_COUNT) { + sub[sub_idx] = page; + slice_count--; if (slice_count == 0) return; + sub_idx++; + } + idx++; // potentially wrap around to the next idx + sub_idx = 0; + } +} + +static size_t mi_page_map_get_idx(mi_page_t* page, size_t* sub_idx, size_t* slice_count) { + size_t page_size; + uint8_t* page_start = mi_page_area(page, &page_size); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer + *slice_count = mi_slice_count_of_size(page_size) + ((page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks + return _mi_page_map_index(page, sub_idx); +} + +void _mi_page_map_register(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! + if mi_unlikely(_mi_page_map == NULL) { + if (!_mi_page_map_init()) return; + } + mi_assert(_mi_page_map!=NULL); + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); + mi_page_map_set_range(page, idx, sub_idx, slice_count); +} + +void _mi_page_map_unregister(mi_page_t* page) { + mi_assert_internal(_mi_page_map != NULL); + // get index and count + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); + // unset the offsets + mi_page_map_set_range(page, idx, sub_idx, slice_count); +} + +void _mi_page_map_unregister_range(void* start, size_t size) { + const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); + size_t sub_idx; + const uintptr_t idx = _mi_page_map_index(start, &sub_idx); + mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not already committed? +} + + +mi_page_t* _mi_safe_ptr_page(const void* p) { + if mi_unlikely(p >= mi_page_map_max_address) return NULL; + size_t sub_idx; + const size_t idx = _mi_page_map_index(p,&sub_idx); + if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL; + mi_page_t** const sub = _mi_page_map[idx]; + if mi_unlikely(sub==NULL) return NULL; + return sub[sub_idx]; +} + +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + return (_mi_safe_ptr_page(p) != NULL); +} + +#endif From a42a2a926b5fd68a40bd7b75d1362d5c1f4e7d1b Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 14:18:33 -0800 Subject: [PATCH 129/264] improving level 2 page-map --- include/mimalloc/internal.h | 11 ++++++----- src/page-map.c | 17 ++++++++++++----- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 8955db5e..5dc2074d 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -470,11 +470,12 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { #else -// 2-level page map - -// one sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space -// the page-map needs 48-16-13 = 19 bits => 2^19 sub map pointers = 4 MiB size. -// we commit the page-map and the sub maps on-demand. +// 2-level page map: +// The page-map is usually 4 MiB and points to sub maps of 64 KiB. 
+// The page-map is committed on-demand (in 64 KiB) parts (and sub-maps are committed on-demand as well) +// One sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space +// The page-map needs 48-16-13 = 19 bits => 2^19 sub map pointers = 4 MiB size. +// (Choosing a MI_PAGE_MAP_SUB_SHIFT of 16 gives slightly better code but will commit the initial sub-map at 512 KiB) #define MI_PAGE_MAP_SUB_SHIFT (13) #define MI_PAGE_MAP_SUB_COUNT (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) diff --git a/src/page-map.c b/src/page-map.c index 99a9b60a..5a25b839 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -159,11 +159,13 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att #else +// A 2-level page map + mi_decl_cache_align mi_page_t*** _mi_page_map; static void* mi_page_map_max_address; static mi_memid_t mi_page_map_memid; -static _Atomic(mi_bfield_t) mi_page_map_commit; // one bit per committed 64 KiB entries +static _Atomic(mi_bfield_t) mi_page_map_commit; static mi_page_t** mi_page_map_ensure_at(size_t idx); static inline void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count); @@ -178,8 +180,10 @@ bool _mi_page_map_init(void) { } // Allocate the page map and commit bits + mi_assert(MI_MAX_VABITS >= vbits); mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT)); + mi_assert(page_map_count <= MI_PAGE_MAP_COUNT); const size_t os_page_size = _mi_os_page_size(); const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size); const size_t reserve_size = page_map_size + os_page_size; @@ -193,7 +197,7 @@ bool _mi_page_map_init(void) { _mi_warning_message("internal: the page map was committed but not zero initialized!\n"); _mi_memzero_aligned(_mi_page_map, page_map_size); } - mi_atomic_store_release(&mi_page_map_commit, (commit ? ~0 : (mi_bfield_t)0)); + mi_atomic_store_release(&mi_page_map_commit, (commit ? 
~MI_ZU(0) : MI_ZU(0))); // commit the first part so NULL pointers get resolved without an access violation mi_page_map_ensure_at(0); @@ -210,9 +214,12 @@ bool _mi_page_map_init(void) { return true; } + +#define MI_PAGE_MAP_ENTRIES_PER_CBIT (MI_PAGE_MAP_COUNT / MI_BFIELD_BITS) + static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) { mi_bfield_t commit = mi_atomic_load_relaxed(&mi_page_map_commit); - const size_t bit_idx = (idx*MI_INTPTR_SIZE)/MI_ARENA_SLICE_SIZE; // we commit a slice of entries at a time + const size_t bit_idx = idx/MI_PAGE_MAP_ENTRIES_PER_CBIT; mi_assert_internal(bit_idx < MI_BFIELD_BITS); if (pbit_idx != NULL) { *pbit_idx = bit_idx; } return ((commit & (MI_ZU(1) << bit_idx)) != 0); @@ -221,8 +228,8 @@ static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) { static mi_page_t** mi_page_map_ensure_committed(size_t idx) { size_t bit_idx; if mi_unlikely(!mi_page_map_is_committed(idx, &bit_idx)) { - uint8_t* start = (uint8_t*)_mi_page_map + (bit_idx * MI_ARENA_SLICE_SIZE); - _mi_os_commit(start, MI_ARENA_SLICE_SIZE, NULL); + uint8_t* start = (uint8_t*)&_mi_page_map[bit_idx * MI_PAGE_MAP_ENTRIES_PER_CBIT]; + _mi_os_commit(start, MI_PAGE_MAP_ENTRIES_PER_CBIT * sizeof(mi_page_t**), NULL); mi_atomic_or_acq_rel(&mi_page_map_commit, MI_ZU(1) << bit_idx); } return _mi_page_map[idx]; From c5cfc92f0cc8809d7fdd5e86c67321d90dd33a04 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 14:39:57 -0800 Subject: [PATCH 130/264] small fixes --- include/mimalloc/bits.h | 2 ++ src/arena-meta.c | 2 +- src/page-map.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index ca0b5905..ed4a7b44 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -107,6 +107,7 @@ typedef int32_t mi_ssize_t; // Define big endian if needed // #define MI_BIG_ENDIAN 1 +// maximum virtual address bits in a user-space pointer #if MI_DEFAULT_VIRTUAL_ADDRESS_BITS > 0 #define MI_MAX_VABITS MI_DEFAULT_VIRTUAL_ADDRESS_BITS #elif MI_ARCH_X64 @@ -117,6 +118,7 @@ typedef int32_t mi_ssize_t; #define MI_MAX_VABITS (32) #endif +// use a flat page-map (or a 2-level one) #ifndef MI_PAGE_MAP_FLAT #if MI_MAX_VABITS <= 40 #define MI_PAGE_MAP_FLAT 1 diff --git a/src/arena-meta.c b/src/arena-meta.c index 065a1331..fcfb680c 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -25,7 +25,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE #define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN -#define MI_META_BLOCK_SIZE (64) +#define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit) #define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE #define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 #define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE) diff --git a/src/page-map.c b/src/page-map.c index 5a25b839..190be6c0 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -147,7 +147,7 @@ void _mi_page_map_unregister_range(void* start, size_t size) { mi_page_t* _mi_safe_ptr_page(const void* p) { if mi_unlikely(p >= mi_page_map_max_address) return NULL; const uintptr_t idx = _mi_page_map_index(p); - if mi_unlikely(mi_page_map_commit == NULL || !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL; + if mi_unlikely(mi_page_map_commit != NULL && !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL; const uintptr_t ofs = _mi_page_map[idx]; if mi_unlikely(ofs == 0) return NULL; return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) - ofs + 1) << MI_ARENA_SLICE_SHIFT); From 516e644359685d38d035e76b1ac7d40df0c22edc Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 16:06:49 -0800 Subject: [PATCH 131/264] rename option pagemap_commit; always commit the page map on macos (for now) --- ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc.h | 4 ++-- include/mimalloc/bits.h | 2 +- include/mimalloc/internal.h | 3 +++ src/alloc.c | 2 +- src/options.c | 11 ++++++++++- src/page-map.c | 4 ++-- test/main-override-static.c | 2 +- 8 files changed, 21 insertions(+), 9 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 87e866bb..2c4477d9 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_SECURE=4;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc.h b/include/mimalloc.h index b0a20e9e..8bff8923 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -398,8 +398,8 @@ typedef enum mi_option_e { mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_full_page_retain, // retain N full pages per size class (=2) mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) - mi_option_max_vabits, // max virtual address bits to consider in user space (=48) - mi_option_debug_commit_full_pagemap, // commit the full pagemap to catch invalid pointer uses (=0) + mi_option_max_vabits, // max user space virtual address bits to consider (=48) + mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index ed4a7b44..875f6230 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -120,7 +120,7 @@ typedef int32_t mi_ssize_t; // use a flat page-map (or a 2-level one) #ifndef MI_PAGE_MAP_FLAT -#if MI_MAX_VABITS <= 40 +#if MI_MAX_VABITS <= 40 && !defined(__APPLE__) #define MI_PAGE_MAP_FLAT 1 #else #define MI_PAGE_MAP_FLAT 0 diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 5dc2074d..9146896c 100644 --- 
a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -445,6 +445,7 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si #if MI_PAGE_MAP_FLAT // flat page-map committed on demand +// single indirection and low commit, but large initial virtual reserve (4 GiB with 48 bit virtual addresses) extern uint8_t* _mi_page_map; static inline size_t _mi_page_map_index(const void* p) { @@ -471,6 +472,8 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { #else // 2-level page map: +// double indirection but low commit and low virtual reserve. +// // The page-map is usually 4 MiB and points to sub maps of 64 KiB. // The page-map is committed on-demand (in 64 KiB) parts (and sub-maps are committed on-demand as well) // One sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space diff --git a/src/alloc.c b/src/alloc.c index e5f2b8ae..6b037987 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -674,7 +674,7 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo #if MI_STAT>1 mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); #endif - _mi_stat_counter_increase(&heap->tld->stats.guarded_alloc_count, 1); + mi_heap_stat_counter_increase(heap, guarded_alloc_count, 1); } #if MI_DEBUG>3 if (p != NULL && zero) { diff --git a/src/options.c b/src/options.c index 4f1a00b8..fc3a2838 100644 --- a/src/options.c +++ b/src/options.c @@ -102,6 +102,14 @@ typedef struct mi_option_desc_s { #endif #endif +#ifndef MI_DEFAULT_PAGEMAP_COMMIT +#if defined(__APPLE__) +#define MI_DEFAULT_PAGEMAP_COMMIT 1 +#else +#define MI_DEFAULT_PAGEMAP_COMMIT 0 +#endif +#endif + static mi_option_desc_t options[_mi_option_last] = { @@ -165,7 +173,8 @@ static mi_option_desc_t options[_mi_option_last] = { 2, UNINIT, MI_OPTION(full_page_retain) }, { 4, UNINIT, MI_OPTION(max_page_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, - { 0, UNINIT, MI_OPTION(debug_commit_full_pagemap) }, + { MI_DEFAULT_PAGEMAP_COMMIT, + UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page-map.c b/src/page-map.c index 190be6c0..37ce3082 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -42,7 +42,7 @@ bool _mi_page_map_init(void) { // Allocate the page map and commit bits mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); - const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_pagemap_commit)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); const size_t bitmap_size = (commit ? 0 : mi_bitmap_size(commit_bits, NULL)); const size_t reserve_size = bitmap_size + page_map_size; @@ -187,7 +187,7 @@ bool _mi_page_map_init(void) { const size_t os_page_size = _mi_os_page_size(); const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size); const size_t reserve_size = page_map_size + os_page_size; - const bool commit = page_map_size <= 64*MI_KiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap); // _mi_os_has_overcommit(); // commit on-access on Linux systems? 
+ const bool commit = page_map_size <= 64*MI_KiB || mi_option_is_enabled(mi_option_pagemap_commit); // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (mi_page_t***)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); diff --git a/test/main-override-static.c b/test/main-override-static.c index 410764bd..b16864db 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -35,7 +35,7 @@ int main() { // corrupt_free(); // block_overflow1(); // block_overflow2(); - // test_canary_leak(); + test_canary_leak(); // test_aslr(); // invalid_free(); // test_reserved(); From 773fe7ae5b914821a1d201fd47b2e12870516f5a Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 17:25:58 -0800 Subject: [PATCH 132/264] support full secure build --- ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc/types.h | 12 ++++++----- src/arena-meta.c | 38 ++++++++++++++++++++++---------- src/arena.c | 43 ++++++++++++++++++++++++++++++------- src/os.c | 4 ++-- src/page.c | 18 ++++++++-------- src/prim/unix/prim.c | 2 +- 7 files changed, 82 insertions(+), 37 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 2c4477d9..dc112272 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_SECURE=4;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=1;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 7009a017..84179458 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -46,11 +46,13 @@ terms of the MIT license. A copy of the license can be found in the file // Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). // #define MI_STAT 1 -// Define MI_SECURE to enable security mitigations -// #define MI_SECURE 1 // guard page around metadata -// #define MI_SECURE 2 // guard page around each mimalloc page -// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) -// #define MI_SECURE 4 // checks for double free. (may be more expensive) +// Define MI_SECURE to enable security mitigations. The lowest two have minimal performance impact: +// #define MI_SECURE 1 // guard page around metadata +// #define MI_SECURE 2 // guard page around each mimalloc page (can fragment VMA's with large heaps..) +// +// The next two levels can have more performance cost: +// #define MI_SECURE 3 // randomize allocations, encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) +// #define MI_SECURE 4 // checks for double free. (may be more expensive) #if !defined(MI_SECURE) #define MI_SECURE 0 diff --git a/src/arena-meta.c b/src/arena-meta.c index fcfb680c..a916706b 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -25,6 +25,12 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE #define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN +#if MI_SECURE +#define MI_META_PAGE_GUARD_SIZE (4*MI_KiB) +#else +#define MI_META_PAGE_GUARD_SIZE (0) +#endif + #define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit) #define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE #define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 @@ -41,7 +47,7 @@ static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_V #if MI_DEBUG > 1 static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) { - mi_meta_page_t* mpage = (mi_meta_page_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN); + mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + MI_META_PAGE_GUARD_SIZE); if (block_idx != NULL) { *block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE; } @@ -54,9 +60,9 @@ static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) { } static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { - mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); + mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - MI_META_PAGE_GUARD_SIZE, MI_META_PAGE_ALIGN)); mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE); - void* p = ((uint8_t*)mpage + (block_idx * MI_META_BLOCK_SIZE)); + void* p = ((uint8_t*)mpage - MI_META_PAGE_GUARD_SIZE + (block_idx * MI_META_BLOCK_SIZE)); mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL)); return p; } @@ -66,22 +72,32 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. mi_memid_t memid; - mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, - true /* commit*/, true /* allow large */, + uint8_t* base = (uint8_t*)_mi_arena_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0, + true /* commit*/, (MI_SECURE==0) /* allow large? 
*/, NULL /* req arena */, 0 /* thread_seq */, &memid); - if (mpage == NULL) return NULL; - mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); + if (base == NULL) return NULL; + mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { - _mi_memzero_aligned(mpage, MI_ARENA_SLICE_SIZE); + _mi_memzero_aligned(base, MI_ARENA_SLICE_SIZE); } - // initialize the page + // guard pages + #if MI_SECURE + if (!memid.is_pinned) { + _mi_os_decommit(base, MI_META_PAGE_GUARD_SIZE); + _mi_os_decommit(base + MI_META_PAGE_SIZE - MI_META_PAGE_GUARD_SIZE, MI_META_PAGE_GUARD_SIZE); + } + #endif + + // initialize the page and free block bitmap + mi_meta_page_t* mpage = (mi_meta_page_t*)(base + MI_META_PAGE_GUARD_SIZE); mpage->memid = memid; mi_bitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); - mi_assert_internal(info_blocks < MI_META_BLOCKS_PER_PAGE); - mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); + const size_t guard_blocks = _mi_divide_up(MI_META_PAGE_GUARD_SIZE, MI_META_BLOCK_SIZE); + mi_assert_internal(info_blocks + 2*guard_blocks < MI_META_BLOCKS_PER_PAGE); + mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks + guard_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks - 2*guard_blocks); // push atomically in front of the meta page list // (note: there is no ABA issue since we never free meta-pages) diff --git a/src/arena.c b/src/arena.c index 0cea5776..aa8ba416 100644 --- a/src/arena.c +++ b/src/arena.c @@ -577,10 +577,16 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t return NULL; } +#if MI_SECURE < 2 +#define MI_ARENA_GUARD_PAGE_SIZE (0) +#else +#define MI_ARENA_GUARD_PAGE_SIZE (4*MI_KiB) +#endif + static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, mi_arena_t* req_arena, size_t tseq) { - const bool allow_large = true; + const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page const bool commit = true; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t page_alignment = MI_ARENA_SLICE_ALIGN; @@ -615,6 +621,14 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); + // guard page at the end + const size_t page_body_size = mi_size_of_slices(slice_count) - MI_ARENA_GUARD_PAGE_SIZE; + #if MI_SECURE >= 2 + if (memid.initially_committed && !memid.is_pinned) { + _mi_os_decommit((uint8_t*)page + page_body_size, MI_ARENA_GUARD_PAGE_SIZE); + } + #endif + // claimed free slices: initialize the page partly if (!memid.initially_zero) { mi_track_mem_undefined(page, slice_count * MI_ARENA_SLICE_SIZE); @@ -625,7 +639,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ } #if MI_DEBUG > 1 if (memid.initially_zero) { - if (!mi_mem_is_zero(page, mi_size_of_slices(slice_count))) { + if (!mi_mem_is_zero(page, page_body_size)) { _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n"); memid.initially_zero = false; _mi_memzero_aligned(page, sizeof(*page)); @@ -655,7 +669,7 @@ static mi_page_t* 
mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ // otherwise start after the info block_start = mi_page_info_size(); } - const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); + const size_t reserved = (os_align ? 1 : (page_body_size - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; @@ -708,7 +722,7 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); - const size_t slice_count = mi_slice_count_of_size(info_size + block_size); + const size_t slice_count = mi_slice_count_of_size(info_size + block_size + MI_ARENA_GUARD_PAGE_SIZE); mi_page_t* page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); if (page == NULL) return NULL; @@ -717,6 +731,7 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si mi_assert(page->reserved == 1); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + _mi_page_init(heap, page); return page; } @@ -774,6 +789,13 @@ void _mi_arena_page_free(mi_page_t* page) { } #endif + // recommit guard page at the end? + #if MI_SECURE >= 2 + if (!page->memid.is_pinned) { + _mi_os_commit((uint8_t*)page + mi_memid_size(page->memid) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE, NULL); + } + #endif + // unregister page _mi_page_map_unregister(page); if (page->memid.memkind == MI_MEM_ARENA) { @@ -1114,12 +1136,17 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_arena_t* arena = (mi_arena_t*)start; // commit & zero if needed - bool is_zero = memid.initially_zero; + const size_t os_page_size = _mi_os_page_size(); if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_slices(info_slices), NULL); + // security: always leave a guard OS page decommitted at the end (already part of info_slices) + _mi_os_commit(arena, mi_size_of_slices(info_slices) - os_page_size, NULL); } - if (!is_zero) { - _mi_memzero(arena, mi_size_of_slices(info_slices)); + else if (!memid.is_pinned) { + // security: decommit a guard OS page at the end of the arena info + _mi_os_decommit((uint8_t*)arena + mi_size_of_slices(info_slices) - os_page_size, os_page_size); + } + if (!memid.initially_zero) { + _mi_memzero(arena, mi_size_of_slices(info_slices) - os_page_size); } // init diff --git a/src/os.c b/src/os.c index 53e8f571..80d44d12 100644 --- a/src/os.c +++ b/src/os.c @@ -536,8 +536,8 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { start = huge_start; if (start == 0) { // Initialize the start address after the 32TiB area - start = ((uintptr_t)32 << 40); // 32TiB virtual start address - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode + start = ((uintptr_t)8 << 40); // 8TiB virtual start address + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB #endif diff --git a/src/page.c b/src/page.c index 200cdaa9..6030161a 100644 --- 
a/src/page.c +++ b/src/page.c @@ -82,7 +82,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - + // const size_t bsize = mi_page_block_size(page); // uint8_t* start = mi_page_start(page); //mi_assert_internal(start + page->capacity*page->block_size == page->top); @@ -475,7 +475,7 @@ static mi_decl_noinline void mi_heap_generic_collect(mi_heap_t* heap) { static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) { MI_UNUSED(stats); - #if (MI_SECURE<=2) + #if (MI_SECURE<3) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); #endif @@ -533,7 +533,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) { MI_UNUSED(stats); - #if (MI_SECURE <= 2) + #if (MI_SECURE<3) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); #endif @@ -561,7 +561,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co ----------------------------------------------------------- */ #define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well. -#if (MI_SECURE>0) +#if (MI_SECURE>=3) #define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many #else #define MI_MIN_EXTEND (1) @@ -574,7 +574,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co // extra test in malloc? or cache effects?) static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_expensive(mi_page_is_valid_init(page)); - #if (MI_SECURE<=2) + #if (MI_SECURE<3) mi_assert(page->free == NULL); mi_assert(page->local_free == NULL); if (page->free != NULL) return; @@ -605,7 +605,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(extend < (1UL<<16)); // and append the extend the free list - if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) { + if (extend < MI_MIN_SLICES || MI_SECURE<3) { //!mi_option_is_enabled(mi_option_secure)) { mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats ); } else { @@ -621,7 +621,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); mi_page_set_heap(page, heap); - + size_t page_size; uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start); mi_track_mem_noaccess(page_start,page_size); @@ -653,7 +653,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); - + // initialize an initial free list mi_page_extend_free(heap,page); mi_assert(mi_page_immediate_available(page)); @@ -740,7 +740,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m } // for each page mi_debug_heap_stat_counter_increase(heap, searches, count); - + // set the page to the best candidate if (page_candidate != NULL) { page = page_candidate; diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index eb351f69..b47fff90 100644 --- a/src/prim/unix/prim.c +++ 
b/src/prim/unix/prim.c @@ -412,7 +412,7 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { int err = 0; // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) err = unix_madvise(start, size, MADV_DONTNEED); - #if !MI_DEBUG && !MI_SECURE + #if !MI_DEBUG && MI_SECURE<=2 *needs_recommit = false; #else *needs_recommit = true; From 9ecadaecd5c04f6ddd7597d665f42329b9c502ab Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 17:55:56 -0800 Subject: [PATCH 133/264] clean up --- src/arena.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/arena.c b/src/arena.c index aa8ba416..b9fbef05 100644 --- a/src/arena.c +++ b/src/arena.c @@ -583,6 +583,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t #define MI_ARENA_GUARD_PAGE_SIZE (4*MI_KiB) #endif +// Allocate a fresh page static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, mi_arena_t* req_arena, size_t tseq) { @@ -622,10 +623,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // guard page at the end - const size_t page_body_size = mi_size_of_slices(slice_count) - MI_ARENA_GUARD_PAGE_SIZE; + const size_t page_noguard_size = mi_size_of_slices(slice_count) - MI_ARENA_GUARD_PAGE_SIZE; #if MI_SECURE >= 2 if (memid.initially_committed && !memid.is_pinned) { - _mi_os_decommit((uint8_t*)page + page_body_size, MI_ARENA_GUARD_PAGE_SIZE); + _mi_os_decommit((uint8_t*)page + page_noguard_size, MI_ARENA_GUARD_PAGE_SIZE); } #endif @@ -639,7 +640,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ } #if MI_DEBUG > 1 if (memid.initially_zero) { - if (!mi_mem_is_zero(page, page_body_size)) { + if (!mi_mem_is_zero(page, page_noguard_size)) { _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n"); memid.initially_zero = false; _mi_memzero_aligned(page, sizeof(*page)); @@ -669,7 +670,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ // otherwise start after the info block_start = mi_page_info_size(); } - const size_t reserved = (os_align ? 1 : (page_body_size - block_start) / block_size); + const size_t reserved = (os_align ? 1 : (page_noguard_size - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; @@ -695,7 +696,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ return page; } -static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size_t block_size) { +// Allocate a regular small/medium/large page. 
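+// (The slice count is chosen by the caller, _mi_arena_page_alloc below, from the page kind
+//  via mi_slice_count_of_size(MI_SMALL/MEDIUM/LARGE_PAGE_SIZE).)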
+static mi_page_t* mi_arena_page_regular_alloc(mi_heap_t* heap, size_t slice_count, size_t block_size) { mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; @@ -716,21 +718,22 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size return NULL; } - -static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { +// Allocate a page containing one block (very large, or with large alignment) +static mi_page_t* mi_arena_page_singleton_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); - const size_t slice_count = mi_slice_count_of_size(info_size + block_size + MI_ARENA_GUARD_PAGE_SIZE); + #if MI_ARENA_GUARD_PAGE_SIZE == 0 + const size_t slice_count = mi_slice_count_of_size(info_size + block_size); + #else + const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, MI_ARENA_GUARD_PAGE_SIZE) + MI_ARENA_GUARD_PAGE_SIZE); + #endif mi_page_t* page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); if (page == NULL) return NULL; - mi_assert(page != NULL); mi_assert(page->reserved == 1); - mi_assert_internal(_mi_ptr_page(page)==page); - mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); _mi_page_init(heap, page); return page; @@ -741,19 +744,19 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block mi_page_t* page; if mi_unlikely(block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { mi_assert_internal(_mi_is_power_of_two(block_alignment)); - page = mi_singleton_page_alloc(heap, block_size, block_alignment); + page = mi_arena_page_singleton_alloc(heap, block_size, block_alignment); } else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); } else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } else { - page = mi_singleton_page_alloc(heap, block_size, block_alignment); + page = mi_arena_page_singleton_alloc(heap, block_size, block_alignment); } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); From db82baf1a8f2952d83c6df91bee9cca4a463e0eb Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 18:09:16 -0800 Subject: [PATCH 134/264] cleanup, some renaming --- include/mimalloc/internal.h | 241 ++++++++++++++++++------------------ src/arena-meta.c | 8 +- src/arena.c | 58 ++++----- src/free.c | 8 +- src/heap.c | 4 +- src/init.c | 2 +- src/page.c | 6 +- 7 files changed, 159 insertions(+), 168 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 9146896c..041e7653 100644 --- a/include/mimalloc/internal.h +++ 
b/include/mimalloc/internal.h @@ -57,171 +57,168 @@ terms of the MIT license. A copy of the license can be found in the file #endif // "libc.c" -#include -void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); -void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); -char _mi_toupper(char c); -int _mi_strnicmp(const char* s, const char* t, size_t n); -void _mi_strlcpy(char* dest, const char* src, size_t dest_size); -void _mi_strlcat(char* dest, const char* src, size_t dest_size); -size_t _mi_strlen(const char* s); -size_t _mi_strnlen(const char* s, size_t max_len); -bool _mi_getenv(const char* name, char* result, size_t result_size); +#include +void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); +void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); +char _mi_toupper(char c); +int _mi_strnicmp(const char* s, const char* t, size_t n); +void _mi_strlcpy(char* dest, const char* src, size_t dest_size); +void _mi_strlcat(char* dest, const char* src, size_t dest_size); +size_t _mi_strlen(const char* s); +size_t _mi_strnlen(const char* s, size_t max_len); +bool _mi_getenv(const char* name, char* result, size_t result_size); // "options.c" -void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); -void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); -void _mi_warning_message(const char* fmt, ...); -void _mi_verbose_message(const char* fmt, ...); -void _mi_trace_message(const char* fmt, ...); -void _mi_output_message(const char* fmt, ...); -void _mi_options_init(void); -long _mi_option_get_fast(mi_option_t option); -void _mi_error_message(int err, const char* fmt, ...); +void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); +void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); +void _mi_warning_message(const char* fmt, ...); +void _mi_verbose_message(const char* fmt, ...); +void _mi_trace_message(const char* fmt, ...); +void _mi_output_message(const char* fmt, ...); +void _mi_options_init(void); +long _mi_option_get_fast(mi_option_t option); +void _mi_error_message(int err, const char* fmt, ...); // random.c -void _mi_random_init(mi_random_ctx_t* ctx); -void _mi_random_init_weak(mi_random_ctx_t* ctx); -void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); -void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); -uintptr_t _mi_random_next(mi_random_ctx_t* ctx); -uintptr_t _mi_heap_random_next(mi_heap_t* heap); -uintptr_t _mi_os_random_weak(uintptr_t extra_seed); +void _mi_random_init(mi_random_ctx_t* ctx); +void _mi_random_init_weak(mi_random_ctx_t* ctx); +void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); +void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); +uintptr_t _mi_random_next(mi_random_ctx_t* ctx); +uintptr_t _mi_heap_random_next(mi_heap_t* heap); +uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c extern mi_decl_cache_align const mi_page_t _mi_page_empty; -void _mi_process_load(void); +void _mi_process_load(void); void mi_cdecl _mi_process_done(void); -bool _mi_is_redirected(void); -bool _mi_allocator_init(const char** message); -void _mi_allocator_done(void); -bool _mi_is_main_thread(void); -size_t _mi_current_thread_count(void); -bool _mi_preloading(void); // true while the C runtime is not initialized yet -void _mi_thread_done(mi_heap_t* heap); +bool _mi_is_redirected(void); +bool _mi_allocator_init(const 
char** message); +void _mi_allocator_done(void); +bool _mi_is_main_thread(void); +size_t _mi_current_thread_count(void); +bool _mi_preloading(void); // true while the C runtime is not initialized yet +void _mi_thread_done(mi_heap_t* heap); -mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` +mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` mi_subproc_t* _mi_subproc(void); mi_subproc_t* _mi_subproc_main(void); +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; size_t _mi_thread_seq_id(void) mi_attr_noexcept; - -mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap -mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); void _mi_heap_guarded_init(mi_heap_t* heap); // os.c -void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_memid_t* memid); -void* _mi_os_zalloc(size_t size, mi_memid_t* memid); -void _mi_os_free(void* p, size_t size, mi_memid_t memid); -void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); +void _mi_os_init(void); // called from process init +void* _mi_os_alloc(size_t size, mi_memid_t* memid); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid); +void _mi_os_free(void* p, size_t size, mi_memid_t memid); +void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); -size_t _mi_os_page_size(void); -size_t _mi_os_good_alloc_size(size_t size); -bool _mi_os_has_overcommit(void); -bool _mi_os_has_virtual_reserve(void); -size_t _mi_os_virtual_address_bits(void); +size_t _mi_os_page_size(void); +size_t _mi_os_good_alloc_size(size_t size); +bool _mi_os_has_overcommit(void); +bool _mi_os_has_virtual_reserve(void); +size_t _mi_os_virtual_address_bits(void); -bool _mi_os_reset(void* addr, size_t size); -bool _mi_os_commit(void* p, size_t size, bool* is_zero); -bool _mi_os_decommit(void* addr, size_t size); -bool _mi_os_protect(void* addr, size_t size); -bool _mi_os_unprotect(void* addr, size_t size); -bool _mi_os_purge(void* p, size_t size); -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); +bool _mi_os_reset(void* addr, size_t size); +bool _mi_os_commit(void* p, size_t size, bool* is_zero); +bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_protect(void* addr, size_t size); +bool _mi_os_unprotect(void* addr, size_t size); +bool _mi_os_purge(void* p, size_t size); +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); -void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); -bool _mi_os_use_large_page(size_t size, size_t alignment); -size_t _mi_os_large_page_size(void); +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); +bool _mi_os_use_large_page(size_t size, size_t alignment); +size_t _mi_os_large_page_size(void); -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); +void* 
_mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); // arena.c mi_arena_id_t _mi_arena_id_none(void); -mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); -void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); -void* _mi_arena_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); -void _mi_arena_free(void* p, size_t size, mi_memid_t memid); -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); -bool _mi_arena_contains(const void* p); -void _mi_arenas_collect(bool force_purge); -void _mi_arena_unsafe_destroy_all(void); +void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); +bool _mi_arenas_contain(const void* p); +void _mi_arenas_collect(bool force_purge); +void _mi_arenas_unsafe_destroy_all(void); -mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); -void _mi_arena_page_free(mi_page_t* page); -void _mi_arena_page_abandon(mi_page_t* page); -void _mi_arena_page_unabandon(mi_page_t* page); -bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); +mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); +void _mi_arenas_page_free(mi_page_t* page); +void _mi_arenas_page_abandon(mi_page_t* page); +void _mi_arenas_page_unabandon(mi_page_t* page); +bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page); // arena-meta.c -void* _mi_meta_zalloc( size_t size, mi_memid_t* memid ); -void _mi_meta_free(void* p, size_t size, mi_memid_t memid); -bool _mi_meta_is_meta_page(void* p); +void* _mi_meta_zalloc( size_t size, mi_memid_t* memid ); +void _mi_meta_free(void* p, size_t size, mi_memid_t memid); +bool _mi_meta_is_meta_page(void* p); // "page-map.c" -bool _mi_page_map_init(void); -void _mi_page_map_register(mi_page_t* page); -void _mi_page_map_unregister(mi_page_t* page); -void _mi_page_map_unregister_range(void* start, size_t size); -mi_page_t* _mi_safe_ptr_page(const void* p); +bool _mi_page_map_init(void); +void _mi_page_map_register(mi_page_t* page); +void _mi_page_map_unregister(mi_page_t* page); +void _mi_page_map_unregister_range(void* start, size_t size); +mi_page_t* _mi_safe_ptr_page(const void* p); // "page.c" -void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; +void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; -void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks -void _mi_page_unfull(mi_page_t* page); -void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq); // free the page -void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... 
-void _mi_page_force_abandon(mi_page_t* page); -void _mi_heap_collect_retired(mi_heap_t* heap, bool force); +void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks +void _mi_page_unfull(mi_page_t* page); +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq); // free the page +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... +void _mi_heap_collect_retired(mi_heap_t* heap, bool force); -size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); -void _mi_deferred_free(mi_heap_t* heap, bool force); +size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); +void _mi_deferred_free(mi_heap_t* heap, bool force); -void _mi_page_free_collect(mi_page_t* page,bool force); -void _mi_page_init(mi_heap_t* heap, mi_page_t* page); +void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_init(mi_heap_t* heap, mi_page_t* page); -size_t _mi_bin_size(uint8_t bin); // for stats -uint8_t _mi_bin(size_t size); // for stats +size_t _mi_bin_size(uint8_t bin); // for stats +uint8_t _mi_bin(size_t size); // for stats // "heap.c" -mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld); -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld); -void _mi_heap_destroy_pages(mi_heap_t* heap); -void _mi_heap_collect_abandon(mi_heap_t* heap); -void _mi_heap_set_default_direct(mi_heap_t* heap); -bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); -void _mi_heap_unsafe_destroy_all(void); -mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); -void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); -bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); -void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); +mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld); +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld); +void _mi_heap_destroy_pages(mi_heap_t* heap); +void _mi_heap_collect_abandon(mi_heap_t* heap); +void _mi_heap_set_default_direct(mi_heap_t* heap); +bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); +void _mi_heap_unsafe_destroy_all(void); +mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); +void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); // "stats.c" -void _mi_stats_done(mi_stats_t* stats); -void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from); -mi_msecs_t _mi_clock_now(void); -mi_msecs_t _mi_clock_end(mi_msecs_t start); -mi_msecs_t _mi_clock_start(void); +void _mi_stats_done(mi_stats_t* stats); +void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from); +mi_msecs_t _mi_clock_now(void); +mi_msecs_t _mi_clock_end(mi_msecs_t start); +mi_msecs_t _mi_clock_start(void); // "alloc.c" -void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` -void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` -void* 
_mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` -void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; -void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` -void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; -mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); +void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; +void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); #if MI_DEBUG>1 -bool _mi_page_is_valid(mi_page_t* page); +bool _mi_page_is_valid(mi_page_t* page); #endif @@ -718,8 +715,8 @@ static inline bool _mi_page_unown(mi_page_t* page) { while mi_unlikely(mi_tf_block(tf_old) != NULL) { _mi_page_free_collect(page, false); // update used if (mi_page_all_free(page)) { // it may become free just before unowning it - _mi_arena_page_unabandon(page); - _mi_arena_page_free(page); + _mi_arenas_page_unabandon(page); + _mi_arenas_page_free(page); return true; } tf_old = mi_atomic_load_relaxed(&page->xthread_free); diff --git a/src/arena-meta.c b/src/arena-meta.c index a916706b..34be6e0e 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -72,9 +72,9 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. mi_memid_t memid; - uint8_t* base = (uint8_t*)_mi_arena_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0, - true /* commit*/, (MI_SECURE==0) /* allow large? */, - NULL /* req arena */, 0 /* thread_seq */, &memid); + uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0, + true /* commit*/, (MI_SECURE==0) /* allow large? 
*/, + NULL /* req arena */, 0 /* thread_seq */, &memid); if (base == NULL) return NULL; mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { @@ -165,7 +165,7 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_bitmap_setN(&mpage->blocks_free, block_idx, block_count,NULL); } else { - _mi_arena_free(p,size,memid); + _mi_arenas_free(p,size,memid); } } diff --git a/src/arena.c b/src/arena.c index b9fbef05..7a016165 100644 --- a/src/arena.c +++ b/src/arena.c @@ -467,7 +467,7 @@ static void* mi_arena_os_alloc_aligned( // Allocate large sized memory -void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, +void* _mi_arenas_alloc_aligned( mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) @@ -493,9 +493,9 @@ void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, return p; } -void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) +void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { - return _mi_arena_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid); + return _mi_arenas_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid); } @@ -521,7 +521,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, // note: this normally never happens unless heaptags are actually used. // (an unown might free the page, and depending on that we can keep it in the abandoned map or not) // note: a minor wrinkle: the page will still be mapped but the abandoned map entry is (temporarily) clear at this point. - // so we cannot check in `mi_arena_free` for this invariant to hold. + // so we cannot check in `mi_arenas_free` for this invariant to hold. const bool freed = _mi_page_unown(page); *keep_abandoned = !freed; return false; @@ -531,7 +531,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, return true; } -static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t slice_count, size_t block_size, mi_arena_t* req_arena, mi_heaptag_t heaptag, size_t tseq) +static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_t slice_count, size_t block_size, mi_arena_t* req_arena, mi_heaptag_t heaptag, size_t tseq) { MI_UNUSED(slice_count); const size_t bin = _mi_bin(block_size); @@ -584,7 +584,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t #endif // Allocate a fresh page -static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, +static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, mi_arena_t* req_arena, size_t tseq) { const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page @@ -697,18 +697,18 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ } // Allocate a regular small/medium/large page. 
-static mi_page_t* mi_arena_page_regular_alloc(mi_heap_t* heap, size_t slice_count, size_t block_size) { +static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_count, size_t block_size) { mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; // 1. look for an abandoned page - mi_page_t* page = mi_arena_page_try_find_abandoned(tld->subproc, slice_count, block_size, req_arena, heap->tag, tld->thread_seq); + mi_page_t* page = mi_arenas_page_try_find_abandoned(tld->subproc, slice_count, block_size, req_arena, heap->tag, tld->thread_seq); if (page != NULL) { return page; // return as abandoned } // 2. find a free block, potentially allocating a new arena - page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq); + page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -719,7 +719,7 @@ static mi_page_t* mi_arena_page_regular_alloc(mi_heap_t* heap, size_t slice_coun } // Allocate a page containing one block (very large, or with large alignment) -static mi_page_t* mi_arena_page_singleton_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { +static mi_page_t* mi_arenas_page_singleton_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); @@ -730,7 +730,7 @@ static mi_page_t* mi_arena_page_singleton_alloc(mi_heap_t* heap, size_t block_si const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, MI_ARENA_GUARD_PAGE_SIZE) + MI_ARENA_GUARD_PAGE_SIZE); #endif - mi_page_t* page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); + mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); if (page == NULL) return NULL; mi_assert(page->reserved == 1); @@ -740,23 +740,23 @@ static mi_page_t* mi_arena_page_singleton_alloc(mi_heap_t* heap, size_t block_si } -mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { +mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { mi_page_t* page; if mi_unlikely(block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { mi_assert_internal(_mi_is_power_of_two(block_alignment)); - page = mi_arena_page_singleton_alloc(heap, block_size, block_alignment); + page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { - page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); } else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { - page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } 
else { - page = mi_arena_page_singleton_alloc(heap, block_size, block_alignment); + page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); @@ -767,7 +767,7 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block return page; } -void _mi_arena_page_free(mi_page_t* page) { +void _mi_arenas_page_free(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -804,14 +804,14 @@ void _mi_arena_page_free(mi_page_t* page) { if (page->memid.memkind == MI_MEM_ARENA) { mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); } - _mi_arena_free(page, mi_memid_size(page->memid), page->memid); + _mi_arenas_free(page, mi_memid_size(page->memid), page->memid); } /* ----------------------------------------------------------- Arena abandon ----------------------------------------------------------- */ -void _mi_arena_page_abandon(mi_page_t* page) { +void _mi_arenas_page_abandon(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -855,7 +855,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { _mi_page_unown(page); } -bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { +bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -871,13 +871,13 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { mi_subproc_t* subproc = _mi_subproc(); mi_subproc_stat_counter_increase( subproc, pages_reabandon_full, 1); mi_subproc_stat_adjust_decrease( subproc, pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh - _mi_arena_page_abandon(page); + _mi_arenas_page_abandon(page); return true; } } // called from `mi_free` if trying to unabandon an abandoned page -void _mi_arena_page_unabandon(mi_page_t* page) { +void _mi_arenas_page_unabandon(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -917,12 +917,6 @@ void _mi_arena_page_unabandon(mi_page_t* page) { } } -void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { - MI_UNUSED(heap); - // TODO: implement this - return; -} - /* ----------------------------------------------------------- Arena free @@ -930,7 +924,7 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); static void mi_arenas_try_purge(bool force, bool visit_all); -void _mi_arena_free(void* p, size_t size, mi_memid_t memid) { +void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) { if (p==NULL) return; if (size==0) return; @@ -1001,7 +995,7 @@ bool mi_arena_contains(mi_arena_id_t arena_id, const void* p) { } // Is a pointer inside any of our arenas? 
-bool _mi_arena_contains(const void* p) { +bool _mi_arenas_contain(const void* p) { mi_subproc_t* subproc = _mi_subproc(); const size_t max_arena = mi_arenas_get_count(subproc); for (size_t i = 0; i < max_arena; i++) { @@ -1043,7 +1037,7 @@ static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(void) { +void _mi_arenas_unsafe_destroy_all(void) { mi_arenas_unsafe_destroy(_mi_subproc()); _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } diff --git a/src/free.c b/src/free.c index d08123a2..4d72cc7a 100644 --- a/src/free.c +++ b/src/free.c @@ -210,9 +210,9 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) - _mi_arena_page_unabandon(page); + _mi_arenas_page_unabandon(page); // we can free the page directly - _mi_arena_page_free(page); + _mi_arenas_page_free(page); return; } @@ -240,7 +240,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { { if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); + _mi_arenas_page_unabandon(page); _mi_heap_page_reclaim(tagheap, page); mi_heap_stat_counter_increase(tagheap, pages_reclaim_on_free, 1); return; @@ -252,7 +252,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations if (!mi_page_is_used_at_frac(page,8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && - _mi_arena_page_try_reabandon_to_mapped(page)) + _mi_arenas_page_try_reabandon_to_mapped(page)) { return; } diff --git a/src/heap.c b/src/heap.c index a1b06c6b..25ddf9b7 100644 --- a/src/heap.c +++ b/src/heap.c @@ -211,7 +211,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena else { // heaps associated wita a specific arena are allocated in that arena // note: takes up at least one slice which is quite wasteful... 
- heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); + heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); } if (heap==NULL) { _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); @@ -341,7 +341,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ page->next = NULL; page->prev = NULL; mi_page_set_heap(page, NULL); - _mi_arena_page_free(page); + _mi_arenas_page_free(page); return true; // keep going } diff --git a/src/init.c b/src/init.c index 5f3fb797..8233f8a3 100644 --- a/src/init.c +++ b/src/init.c @@ -713,7 +713,7 @@ void mi_cdecl _mi_process_done(void) { if (mi_option_is_enabled(mi_option_destroy_on_exit)) { mi_collect(true /* force */); _mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) - _mi_arena_unsafe_destroy_all(); + _mi_arenas_unsafe_destroy_all(); } if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { diff --git a/src/page.c b/src/page.c index 6030161a..7c8429a9 100644 --- a/src/page.c +++ b/src/page.c @@ -252,7 +252,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { else { mi_page_queue_remove(pq, page); mi_page_set_heap(page, NULL); - _mi_arena_page_abandon(page); + _mi_arenas_page_abandon(page); } } @@ -264,7 +264,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_assert_internal(mi_heap_contains_queue(heap, pq)); mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); #endif - mi_page_t* page = _mi_arena_page_alloc(heap, block_size, page_alignment); + mi_page_t* page = _mi_arenas_page_alloc(heap, block_size, page_alignment); if (page == NULL) { // out-of-memory return NULL; @@ -357,7 +357,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) { // and free it mi_page_set_heap(page,NULL); - _mi_arena_page_free(page); + _mi_arenas_page_free(page); } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE From e61ab67185f1c89c71dec7f7e4508bc8ed9c7f82 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 18:31:33 -0800 Subject: [PATCH 135/264] cleanup --- ide/vs2022/mimalloc.vcxproj | 2 +- src/arena.c | 46 ++++++++++++++++++++++--------------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index dc112272..87e866bb 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=1;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/src/arena.c b/src/arena.c index 7a016165..b5c17d95 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,6 +42,7 @@ typedef struct mi_arena_s { int numa_node; // associated NUMA node bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) + long purge_delay; // from the options, but allows setting per arena _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. mi_bitmap_t* slices_free; // is the slice free? 
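The `purge_delay` field added above caches the option-derived purge delay per arena. A small sketch of that computation and its meaning, matching the initialization later in this patch; the concrete numbers are assumed option defaults and are not taken from the diff:

    // same expression as used to initialize arena->purge_delay below
    long delay = mi_option_get(mi_option_purge_delay)        // e.g. 10 (milliseconds)
               * mi_option_get(mi_option_arena_purge_mult);  // e.g. x10  => 100 ms
    // delay < 0  : purging is disabled
    // delay == 0 : purge immediately
    // delay > 0  : delay purging by `delay` milliseconds (tracked via `purge_expire`)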
@@ -793,6 +794,7 @@ void _mi_arenas_page_free(mi_page_t* page) { #endif // recommit guard page at the end? + // we must do this since we may later allocate large spans over this page and cannot have a guard page in between #if MI_SECURE >= 2 if (!page->memid.is_pinned) { _mi_os_commit((uint8_t*)page + mi_memid_size(page->memid) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE, NULL); @@ -1047,7 +1049,7 @@ void _mi_arenas_unsafe_destroy_all(void) { Add an arena. ----------------------------------------------------------- */ -static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id) { +static bool mi_arenas_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id) { mi_assert_internal(arena != NULL); mi_assert_internal(arena->slice_count > 0); if (arena_id != NULL) { *arena_id = NULL; } @@ -1089,7 +1091,7 @@ static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_bas const size_t size = base_size + bitmaps_size; const size_t os_page_size = _mi_os_page_size(); - const size_t info_size = _mi_align_up(size, os_page_size) + os_page_size; // + guard page + const size_t info_size = _mi_align_up(size, os_page_size) + MI_ARENA_GUARD_PAGE_SIZE; const size_t info_slices = mi_slice_count_of_size(info_size); if (bitmap_base != NULL) *bitmap_base = base_size; @@ -1132,18 +1134,19 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_arena_t* arena = (mi_arena_t*)start; - // commit & zero if needed - const size_t os_page_size = _mi_os_page_size(); + // commit & zero if needed if (!memid.initially_committed) { - // security: always leave a guard OS page decommitted at the end (already part of info_slices) - _mi_os_commit(arena, mi_size_of_slices(info_slices) - os_page_size, NULL); + // if MI_SECURE, leave a guard OS page decommitted at the end + _mi_os_commit(arena, mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE, NULL); } else if (!memid.is_pinned) { - // security: decommit a guard OS page at the end of the arena info - _mi_os_decommit((uint8_t*)arena + mi_size_of_slices(info_slices) - os_page_size, os_page_size); + #if MI_SECURE > 0 + // if MI_SECURE, decommit a guard OS page at the end of the arena info + _mi_os_decommit((uint8_t*)arena + mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE); + #endif } if (!memid.initially_zero) { - _mi_memzero(arena, mi_size_of_slices(info_slices) - os_page_size); + _mi_memzero(arena, mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE); } // init @@ -1155,6 +1158,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s arena->numa_node = numa_node; // TODO: or get the current numa node if -1? 
(now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; + arena->purge_delay = mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult); // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps @@ -1184,7 +1188,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } - return mi_arena_add(subproc, arena, arena_id); + return mi_arenas_add(subproc, arena, arena_id); } @@ -1427,9 +1431,14 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv Arena purge ----------------------------------------------------------- */ -static long mi_arena_purge_delay(void) { +static long mi_arena_purge_delay(mi_arena_t* arena) { // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); + if (arena==NULL) { + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); + } + else { + return arena->purge_delay; + } } // reset or decommit in an arena and update the commit bitmap @@ -1459,8 +1468,8 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c // Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. // Note: assumes we (still) own the area as we may purge immediately static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { - const long delay = mi_arena_purge_delay(); - if (delay < 0 || _mi_preloading()) return; // is purging allowed at all? + const long delay = mi_arena_purge_delay(arena); + if (arena->memid.is_pinned || delay < 0 || _mi_preloading()) return; // is purging allowed at all? mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (delay == 0) { @@ -1542,7 +1551,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) // this also clears those ranges atomically (so any newly freed blocks will get purged next // time around) - mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; + mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(arena), true /*all?*/, false /*any?*/}; _mi_bitmap_forall_setc_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); return vinfo.any_purged; @@ -1551,7 +1560,8 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) static void mi_arenas_try_purge(bool force, bool visit_all) { - if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + const long delay = mi_arena_purge_delay(NULL); + if (_mi_preloading() || delay <= 0) return; // nothing will be scheduled // check if any arena needs purging? mi_tld_t* tld = _mi_tld(); @@ -1568,7 +1578,7 @@ static void mi_arenas_try_purge(bool force, bool visit_all) mi_atomic_guard(&purge_guard) { // increase global expire: at most one purge per delay cycle - mi_atomic_store_release(&subproc->purge_expire, now + mi_arena_purge_delay()); + mi_atomic_store_release(&subproc->purge_expire, now + delay); const size_t arena_start = tld->thread_seq % max_arena; size_t max_purge_count = (visit_all ? 
max_arena : 2); bool all_visited = true; @@ -1688,7 +1698,7 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* are arena->is_exclusive = true; arena->subproc = _mi_subproc(); - if (!mi_arena_add(arena->subproc, arena, arena_id)) { + if (!mi_arenas_add(arena->subproc, arena, arena_id)) { return false; } mi_arena_pages_reregister(arena); From f605cb73e524cbfcab36c86cc351a6310640a3fb Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 18:33:44 -0800 Subject: [PATCH 136/264] old purge delay --- src/arena.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/arena.c b/src/arena.c index b5c17d95..4926e667 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,7 +42,6 @@ typedef struct mi_arena_s { int numa_node; // associated NUMA node bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) - long purge_delay; // from the options, but allows setting per arena _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. mi_bitmap_t* slices_free; // is the slice free? @@ -1158,7 +1157,6 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; - arena->purge_delay = mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult); // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps @@ -1431,14 +1429,9 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv Arena purge ----------------------------------------------------------- */ -static long mi_arena_purge_delay(mi_arena_t* arena) { +static long mi_arena_purge_delay(void) { // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - if (arena==NULL) { - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); - } - else { - return arena->purge_delay; - } + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); } // reset or decommit in an arena and update the commit bitmap @@ -1468,7 +1461,7 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c // Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. // Note: assumes we (still) own the area as we may purge immediately static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { - const long delay = mi_arena_purge_delay(arena); + const long delay = mi_arena_purge_delay(); if (arena->memid.is_pinned || delay < 0 || _mi_preloading()) return; // is purging allowed at all? 
mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); @@ -1551,7 +1544,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) // this also clears those ranges atomically (so any newly freed blocks will get purged next // time around) - mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(arena), true /*all?*/, false /*any?*/}; + mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; _mi_bitmap_forall_setc_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); return vinfo.any_purged; @@ -1560,7 +1553,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) static void mi_arenas_try_purge(bool force, bool visit_all) { - const long delay = mi_arena_purge_delay(NULL); + const long delay = mi_arena_purge_delay(); if (_mi_preloading() || delay <= 0) return; // nothing will be scheduled // check if any arena needs purging? From 8d2b7b0383a6ed10b02881531b3e7e25f6c68a38 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 18:34:39 -0800 Subject: [PATCH 137/264] merge from dev3 --- src/arena.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/arena.c b/src/arena.c index ca22c47c..0f6388a9 100644 --- a/src/arena.c +++ b/src/arena.c @@ -176,10 +176,6 @@ static size_t mi_memid_size(mi_memid_t memid) { /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} static mi_decl_noinline void* mi_arena_try_alloc_at( mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) From dd1b37c9f8dc0d712b9b32bc88ef40bdb71e46a9 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 21:03:03 -0800 Subject: [PATCH 138/264] fix recursive tls access on macOS <= 14 --- include/mimalloc/internal.h | 5 ++--- src/arena.c | 15 +++++++-------- src/heap.c | 14 +++++++------- src/init.c | 24 +++++++++++++++--------- src/page.c | 4 +++- src/stats.c | 14 +++++++------- 6 files changed, 41 insertions(+), 35 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 041e7653..e98a37f5 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -101,7 +101,6 @@ size_t _mi_current_thread_count(void); bool _mi_preloading(void); // true while the C runtime is not initialized yet void _mi_thread_done(mi_heap_t* heap); -mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` mi_subproc_t* _mi_subproc(void); mi_subproc_t* _mi_subproc_main(void); mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); @@ -148,8 +147,8 @@ void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); bool _mi_arenas_contain(const void* p); -void _mi_arenas_collect(bool force_purge); -void _mi_arenas_unsafe_destroy_all(void); +void _mi_arenas_collect(bool force_purge, mi_tld_t* tld); +void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld); mi_page_t* 
_mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); void _mi_arenas_page_free(mi_page_t* page); diff --git a/src/arena.c b/src/arena.c index 4926e667..88524ea2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -923,7 +923,7 @@ void _mi_arenas_page_unabandon(mi_page_t* page) { Arena free ----------------------------------------------------------- */ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); -static void mi_arenas_try_purge(bool force, bool visit_all); +static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld); void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) { if (p==NULL) return; @@ -979,12 +979,12 @@ void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) { } // try to purge expired decommits - mi_arenas_try_purge(false, false); + // mi_arenas_try_purge(false, false, NULL); } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */); +void _mi_arenas_collect(bool force_purge, mi_tld_t* tld) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, tld); } @@ -1038,9 +1038,9 @@ static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arenas_unsafe_destroy_all(void) { +void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld) { mi_arenas_unsafe_destroy(_mi_subproc()); - _mi_arenas_collect(true /* force purge */); // purge non-owned arenas + _mi_arenas_collect(true /* force purge */, tld); // purge non-owned arenas } @@ -1551,13 +1551,12 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) } -static void mi_arenas_try_purge(bool force, bool visit_all) +static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) { const long delay = mi_arena_purge_delay(); if (_mi_preloading() || delay <= 0) return; // nothing will be scheduled // check if any arena needs purging? - mi_tld_t* tld = _mi_tld(); mi_subproc_t* subproc = tld->subproc; const mi_msecs_t now = _mi_clock_now(); mi_msecs_t arenas_expire = mi_atomic_load_acquire(&subproc->purge_expire); diff --git a/src/heap.c b/src/heap.c index 25ddf9b7..6632861b 100644 --- a/src/heap.c +++ b/src/heap.c @@ -120,7 +120,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); // collect arenas (this is program wide so don't force purges on abandonment of threads) - _mi_arenas_collect(collect == MI_FORCE /* force purge? */); + _mi_arenas_collect(collect == MI_FORCE /* force purge? 
*/, heap->tld); } void _mi_heap_collect_abandon(mi_heap_t* heap) { @@ -204,7 +204,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena mi_assert(heap_tag >= 0 && heap_tag < 256); // allocate and initialize a heap mi_memid_t memid; - mi_heap_t* heap; + mi_heap_t* heap; if (arena_id == _mi_arena_id_none()) { heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); } @@ -444,7 +444,7 @@ void mi_heap_delete(mi_heap_t* heap) // abandon all pages _mi_heap_collect_abandon(heap); - + mi_assert_internal(heap->page_count==0); mi_heap_free(heap,true); } @@ -471,7 +471,7 @@ void mi_heap_unload(mi_heap_t* heap) { _mi_warning_message("cannot unload heaps that are not associated with an exclusive arena\n"); return; } - + // abandon all pages so all thread'id in the pages are cleared _mi_heap_collect_abandon(heap); mi_assert_internal(heap->page_count==0); @@ -485,7 +485,7 @@ void mi_heap_unload(mi_heap_t* heap) { } bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) { - mi_assert(mi_heap_is_initialized(heap)); + mi_assert(mi_heap_is_initialized(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return false; if (heap->exclusive_arena == NULL) { _mi_warning_message("cannot reload heaps that were not associated with an exclusive arena\n"); @@ -503,8 +503,8 @@ bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) { mi_assert_internal(heap->page_count==0); - // re-associate from the current thread-local and static state - heap->tld = _mi_tld(); + // re-associate with the current thread-local and static state + heap->tld = mi_heap_get_default()->tld; // reinit direct pages (as we may be in a different process) mi_assert_internal(heap->page_count == 0); diff --git a/src/init.c b/src/init.c index 8233f8a3..5240611c 100644 --- a/src/init.c +++ b/src/init.c @@ -309,17 +309,21 @@ static mi_tld_t* mi_tld_alloc(void) { #define MI_TLD_INVALID ((mi_tld_t*)1) -mi_decl_noinline static void mi_tld_free(void) { - mi_tld_t* tld = _mi_tld(); +mi_decl_noinline static void mi_tld_free(mi_tld_t* tld) { if (tld != NULL && tld != MI_TLD_INVALID) { _mi_stats_done(&tld->stats); _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); } - tld = MI_TLD_INVALID; + #if 0 + // do not read/write to `thread_tld` on older macOS <= 14 as that will re-initialize the thread local storage + // (since we are calling this during pthread shutdown) + // (and this could happen on other systems as well, so let's never do it) + thread_tld = MI_TLD_INVALID; + #endif mi_atomic_decrement_relaxed(&thread_count); } -mi_decl_noinline mi_tld_t* _mi_tld(void) { +static mi_tld_t* mi_tld(void) { mi_tld_t* tld = thread_tld; if (tld == MI_TLD_INVALID) { _mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n"); @@ -337,11 +341,11 @@ mi_subproc_t* _mi_subproc(void) { // on such systems we can check for this with the _mi_prim_get_default_heap as those are protected (by being // stored in a TLS slot for example) mi_heap_t* heap = mi_prim_get_default_heap(); - if (heap == NULL || heap == &_mi_heap_empty) { + if (heap == NULL) { return _mi_subproc_main(); } else { - return thread_tld->subproc; // don't call `_mi_tld()` + return heap->tld->subproc; // avoid using thread local storage (`thread_tld`) } } @@ -395,7 +399,7 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { - mi_tld_t* tld = _mi_tld(); + mi_tld_t* tld = mi_tld(); if (tld == NULL) return; mi_assert(tld->subproc == &subproc_main); if (tld->subproc != 
&subproc_main) return; @@ -553,10 +557,12 @@ void _mi_thread_done(mi_heap_t* heap) if (heap->tld->thread_id != _mi_prim_thread_id()) return; // abandon the thread local heap + // note: we store the tld as we should avoid reading `thread_tld` at this point (to avoid reinitializing the thread local storage) + mi_tld_t* tld = heap->tld; _mi_thread_heap_done(heap); // returns true if already ran // free thread local data - mi_tld_free(); + mi_tld_free(tld); } void _mi_heap_set_default_direct(mi_heap_t* heap) { @@ -713,7 +719,7 @@ void mi_cdecl _mi_process_done(void) { if (mi_option_is_enabled(mi_option_destroy_on_exit)) { mi_collect(true /* force */); _mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) - _mi_arenas_unsafe_destroy_all(); + _mi_arenas_unsafe_destroy_all(&tld_main); } if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { diff --git a/src/page.c b/src/page.c index 7c8429a9..239d5d6e 100644 --- a/src/page.c +++ b/src/page.c @@ -252,7 +252,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { else { mi_page_queue_remove(pq, page); mi_page_set_heap(page, NULL); - _mi_arenas_page_abandon(page); + _mi_arenas_page_abandon(page); } } @@ -356,8 +356,10 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) { mi_page_queue_remove(pq, page); // and free it + mi_heap_t* heap = page->heap; mi_page_set_heap(page,NULL); _mi_arenas_page_free(page); + _mi_arenas_collect(false, heap->tld); // allow purging } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE diff --git a/src/stats.c b/src/stats.c index 102373ec..057dc093 100644 --- a/src/stats.c +++ b/src/stats.c @@ -47,11 +47,11 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { // Adjust stats to compensate; for example before committing a range, -// first adjust downwards with parts that were already committed so +// first adjust downwards with parts that were already committed so // we avoid double counting. static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { if (amount == 0) return; - // adjust atomically + // adjust atomically mi_atomic_addi64_relaxed(&stat->current, amount); mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount); } @@ -74,7 +74,7 @@ void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) { void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { stat->count++; - stat->total += amount; + stat->total += amount; } void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) { @@ -150,7 +150,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); mi_stat_counter_add(&stats->searches, &src->searches, 1); mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); - mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); + mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { @@ -347,7 +347,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) #endif #if MI_STAT mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg); - mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 
1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); + mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); mi_stat_count_t total = { 0,0,0,0 }; mi_stat_add(&total, &stats->normal, 1); mi_stat_add(&total, &stats->huge, 1); @@ -408,7 +408,7 @@ static mi_msecs_t mi_process_start; // = 0 // return thread local stats static mi_stats_t* mi_get_tld_stats(void) { - return &_mi_tld()->stats; + return &mi_heap_get_default()->tld->stats; } void mi_stats_reset(void) mi_attr_noexcept { @@ -492,7 +492,7 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s pinfo.page_faults = 0; _mi_prim_process_info(&pinfo); - + if (elapsed_msecs!=NULL) *elapsed_msecs = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX)); if (user_msecs!=NULL) *user_msecs = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX)); if (system_msecs!=NULL) *system_msecs = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.stime : PTRDIFF_MAX)); From 04970f43e5d45fe18e868a020db58aabe2180f3c Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 21:55:40 -0800 Subject: [PATCH 139/264] document way to use a TLS slot on windows --- include/mimalloc/prim.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 99791585..2d681062 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -207,6 +207,20 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #endif } +#elif 0 && _MSC_VER && _WIN32 +// On Windows, using a fixed TLS slot has better codegen than a thread-local +// but it might clash with an application trying to use the same slot. (so we disable this by default) +#include + +#define MI_HAS_TLS_SLOT +#define MI_TLS_SLOT 63 // last available slot + +static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { + return NtCurrentTeb()->TlsSlots[slot]; +} +static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { + NtCurrentTeb()->TlsSlots[slot] = value; +} #endif // Do we have __builtin_thread_pointer? 
This would be the preferred way to get a unique thread id From bc5ae316493d58047d22df2dcd4689d7c4a82246 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 22:31:16 -0800 Subject: [PATCH 140/264] add abandoned_visit_blocks --- src/arena.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 88524ea2..00ff3720 100644 --- a/src/arena.c +++ b/src/arena.c @@ -352,6 +352,7 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena mi_arena_t* name_arena; \ if (req_arena != NULL) { \ name_arena = req_arena; /* if there is a specific req_arena, only search that one */\ + if (_i > 0) break; /* only once */ \ } \ else { \ size_t _idx; \ @@ -369,7 +370,6 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena #define mi_forall_arenas_end() \ } \ - if (req_arena != NULL) break; \ } \ } @@ -1594,10 +1594,71 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) } } +/* ----------------------------------------------------------- + Visit abandoned pages +----------------------------------------------------------- */ + +typedef struct mi_abandoned_page_visit_info_s { + int heap_tag; + mi_block_visit_fun* visitor; + void* arg; + bool visit_blocks; +} mi_abandoned_page_visit_info_t; + +static bool abandoned_page_visit(mi_page_t* page, mi_abandoned_page_visit_info_t* vinfo) { + if (page->heap_tag != vinfo->heap_tag) { return true; } // continue + mi_heap_area_t area; + _mi_heap_area_init(&area, page); + if (!vinfo->visitor(NULL, &area, NULL, area.block_size, vinfo->arg)) { + return false; + } + if (vinfo->visit_blocks) { + return _mi_heap_area_visit_blocks(&area, page, vinfo->visitor, vinfo->arg); + } + else { + return true; + } +} + +static bool abandoned_page_visit_at(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) { + MI_UNUSED(slice_count); + mi_abandoned_page_visit_info_t* vinfo = (mi_abandoned_page_visit_info_t*)arg; + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_page_is_abandoned_mapped(page)); + return abandoned_page_visit(page, vinfo); +} + +// Visit all abandoned pages in this subproc. bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + mi_abandoned_page_visit_info_t visit_info = { heap_tag, visitor, arg, visit_blocks }; MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); - _mi_error_message(EINVAL, "implement mi_abandoned_visit_blocks\n"); - return false; + + // visit abandoned pages in the arenas + // we don't have to claim because we assume we are the only thread running (in this subproc). + // (but we could atomically claim as well by first doing abandoned_reclaim and afterwards reabandoning). + bool ok = true; + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + mi_forall_arenas(subproc, NULL, 0, arena) { + mi_assert_internal(arena->subproc == subproc); + for (size_t bin = 0; ok && bin < MI_BIN_COUNT; bin++) { + // todo: if we had a single abandoned page map as well, this can be faster. 
+ if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) > 0) { + ok = _mi_bitmap_forall_set(arena->pages_abandoned[bin], &abandoned_page_visit_at, arena, &visit_info); + } + } + } + mi_forall_arenas_end(); + if (!ok) return false; + + // visit abandoned pages in OS allocated memory + // (technically we don't need the lock as we assume we are the only thread running in this subproc) + mi_lock(&subproc->os_abandoned_pages_lock) { + for (mi_page_t* page = subproc->os_abandoned_pages; ok && page != NULL; page = page->next) { + ok = abandoned_page_visit(page, &visit_info); + } + } + + return ok; } @@ -1697,3 +1758,4 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* are return true; } + From 657135de36edad2082323426aea3e2fa1a9cf19a Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 09:53:52 -0800 Subject: [PATCH 141/264] commit 2level page-map on over-commit systems --- CMakeLists.txt | 18 +++++++++++------- include/mimalloc/internal.h | 26 ++++++++++++-------------- src/options.c | 2 +- src/page-map.c | 3 ++- 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 07a292e0..c184a0b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,9 @@ option(MI_PADDING "Enable padding to detect heap block overflow (alway option(MI_OVERRIDE "Override the standard malloc interface (i.e. define entry points for 'malloc', 'free', etc)" ON) option(MI_XMALLOC "Enable abort() call on memory allocation failure by default" OFF) option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF) -option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF) -option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF) -option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) +option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) + option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON) option(MI_OPT_SIMD "Use SIMD instructions (requires MI_OPT_ARCH to be enabled)" OFF) option(MI_SEE_ASM "Generate assembly files" OFF) @@ -21,14 +20,19 @@ option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON) option(MI_LOCAL_DYNAMIC_TLS "Use local-dynamic-tls, a slightly slower but dlopen-compatible thread local storage mechanism (Unix)" OFF) -option(MI_LIBC_MUSL "Set this when linking with musl libc" OFF) +option(MI_LIBC_MUSL "Enable this when linking with musl libc" OFF) + +option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF) +option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF) +option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF) +option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF) +option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) + option(MI_BUILD_SHARED "Build shared library" ON) option(MI_BUILD_STATIC "Build static library" ON) 
option(MI_BUILD_OBJECT "Build object library" ON) option(MI_BUILD_TESTS "Build test executables" ON) -option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF) -option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF) -option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF) + option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF) option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." OFF) option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e98a37f5..4cb54d6f 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -435,13 +435,14 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si /* ----------------------------------------------------------- - Pages + The page map maps addresses to `mi_page_t` pointers ----------------------------------------------------------- */ #if MI_PAGE_MAP_FLAT -// flat page-map committed on demand +// flat page-map committed on demand, using one byte per slice (64 KiB). // single indirection and low commit, but large initial virtual reserve (4 GiB with 48 bit virtual addresses) +// used by default on <= 40 bit virtual address spaces. extern uint8_t* _mi_page_map; static inline size_t _mi_page_map_index(const void* p) { @@ -468,26 +469,23 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { #else // 2-level page map: -// double indirection but low commit and low virtual reserve. -// -// The page-map is usually 4 MiB and points to sub maps of 64 KiB. -// The page-map is committed on-demand (in 64 KiB) parts (and sub-maps are committed on-demand as well) -// One sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space -// The page-map needs 48-16-13 = 19 bits => 2^19 sub map pointers = 4 MiB size. -// (Choosing a MI_PAGE_MAP_SUB_SHIFT of 16 gives slightly better code but will commit the initial sub-map at 512 KiB) - +// double indirection, but low commit and low virtual reserve. +// +// the page-map is usually 4 MiB and points to sub maps of 64 KiB. +// the page-map is committed on-demand (in 64 KiB parts) (and sub-maps are committed on-demand as well) +// one sub page-map = 64 KiB => covers 2^(16-3) * 2^16 = 2^29 = 512 MiB address space +// the page-map needs 48-(16+13) = 19 bits => 2^19 sub map pointers = 4 MiB size. 
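To make the address arithmetic above concrete, here is an illustrative sketch of the resulting two-level lookup (assuming 48-bit virtual addresses and 64 KiB slices; the actual `_mi_page_map_index` and `_mi_unchecked_ptr_page` definitions follow in this hunk):

    static inline mi_page_t* lookup_sketch(const void* p) {
      const size_t u   = (size_t)((uintptr_t)p / MI_ARENA_SLICE_SIZE);  // slice number
      const size_t idx = u / MI_PAGE_MAP_SUB_COUNT;   // top level: 2^19 entries * 8 bytes = 4 MiB
      const size_t sub = u % MI_PAGE_MAP_SUB_COUNT;   // sub map  : 2^13 entries * 8 bytes = 64 KiB
      return _mi_page_map[idx][sub];                  // each sub map covers 2^13 * 64 KiB = 512 MiB
    }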
#define MI_PAGE_MAP_SUB_SHIFT (13) #define MI_PAGE_MAP_SUB_COUNT (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) - #define MI_PAGE_MAP_SHIFT (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT) #define MI_PAGE_MAP_COUNT (MI_ZU(1) << MI_PAGE_MAP_SHIFT) extern mi_page_t*** _mi_page_map; static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) { - const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; - if (sub_idx != NULL) { *sub_idx = (uint32_t)u % MI_PAGE_MAP_SUB_COUNT; } - return (size_t)(u / MI_PAGE_MAP_SUB_COUNT); + const size_t u = (size_t)((uintptr_t)p / MI_ARENA_SLICE_SIZE); + if (sub_idx != NULL) { *sub_idx = u % MI_PAGE_MAP_SUB_COUNT; } + return (u / MI_PAGE_MAP_SUB_COUNT); } static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { diff --git a/src/options.c b/src/options.c index fc3a2838..7562cd46 100644 --- a/src/options.c +++ b/src/options.c @@ -103,7 +103,7 @@ typedef struct mi_option_desc_s { #endif #ifndef MI_DEFAULT_PAGEMAP_COMMIT -#if defined(__APPLE__) +#if defined(__APPLE__) // when overloading malloc, we still get mixed pointers sometimes on macOS; this avoids a bad access #define MI_DEFAULT_PAGEMAP_COMMIT 1 #else #define MI_DEFAULT_PAGEMAP_COMMIT 0 diff --git a/src/page-map.c b/src/page-map.c index 37ce3082..db14265b 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -187,7 +187,8 @@ bool _mi_page_map_init(void) { const size_t os_page_size = _mi_os_page_size(); const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size); const size_t reserve_size = page_map_size + os_page_size; - const bool commit = page_map_size <= 64*MI_KiB || mi_option_is_enabled(mi_option_pagemap_commit); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + const bool commit = page_map_size <= 64*MI_KiB || + mi_option_is_enabled(mi_option_pagemap_commit) || _mi_os_has_overcommit(); _mi_page_map = (mi_page_t***)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); From 88d8ee964f818b09ccd56c078b90851c78cd9af2 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 15:04:06 -0800 Subject: [PATCH 142/264] remove is_large member (and use is_pinned for this) --- doc/mimalloc-doc.h | 7 +++---- include/mimalloc.h | 4 ++-- include/mimalloc/internal.h | 4 ++-- src/arena.c | 23 ++++++++++------------- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/doc/mimalloc-doc.h b/doc/mimalloc-doc.h index e1c14b44..e9da9b90 100644 --- a/doc/mimalloc-doc.h +++ b/doc/mimalloc-doc.h @@ -431,12 +431,11 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large); /// @param start Start of the memory area /// @param size The size of the memory area. /// @param is_committed Is the area already committed? -/// @param is_large Does it consist of large OS pages? Set this to \a true as well for memory -/// that should not be decommitted or protected (like rdma etc.) +/// @param is_pinned Can the memory not be decommitted or reset? (usually the case for large OS pages) /// @param is_zero Does the area consists of zero's? /// @param numa_node Possible associated numa node or `-1`. /// @return \a true if successful, and \a false on error. 
-bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node); +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node); /// Reserve \a pages of huge OS pages (1GiB) evenly divided over \a numa_nodes nodes, /// but stops after at most `timeout_msecs` seconds. @@ -589,7 +588,7 @@ void mi_subproc_add_current_thread(mi_subproc_id_t subproc); /// Allocate \a size bytes aligned by \a alignment. /// @param size number of bytes to allocate. -/// @param alignment the minimal alignment of the allocated memory. +/// @param alignment the minimal alignment of the allocated memory. /// @returns pointer to the allocated memory or \a NULL if out of memory, /// or if the alignment is not a power of 2 (including 0). The \a size is unrestricted /// (and does not have to be an integral multiple of the \a alignment). diff --git a/include/mimalloc.h b/include/mimalloc.h index 8bff8923..508e6aec 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -274,7 +274,7 @@ mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; -mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; +mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept; mi_decl_export void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept; @@ -283,7 +283,7 @@ typedef void* mi_arena_id_t; mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; -mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; +mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; #if MI_MALLOC_VERSION >= 182 // Create a heap that only allocates in the specified arena diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 4cb54d6f..281f531a 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -143,8 +143,8 @@ mi_arena_id_t _mi_arena_id_none(void); mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); -void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); -void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, 
mi_memid_t* memid); +void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); bool _mi_arenas_contain(const void* p); void _mi_arenas_collect(bool force_purge, mi_tld_t* tld); diff --git a/src/arena.c b/src/arena.c index 00ff3720..7b97fbbc 100644 --- a/src/arena.c +++ b/src/arena.c @@ -41,7 +41,6 @@ typedef struct mi_arena_s { size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node bool is_exclusive; // only allow allocations if specifically for this arena - bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. mi_bitmap_t* slices_free; // is the slice free? @@ -333,8 +332,8 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ Arena iteration ----------------------------------------------------------- */ -static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_large) { - if (!allow_large && arena->is_large) return false; +static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_pinned) { + if (!allow_pinned && arena->memid.is_pinned) return false; if (!mi_arena_id_is_suitable(arena, req_arena)) return false; if (req_arena == NULL) { // if not specific, check numa affinity const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); @@ -1104,7 +1103,7 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { } -static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); @@ -1154,8 +1153,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s arena->is_exclusive = exclusive; arena->slice_count = slice_count; arena->info_slices = info_slices; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) - arena->is_large = is_large; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? 
(now it allows anyone to allocate on -1) arena->purge_expire = 0; // mi_lock_init(&arena->abandoned_visit_lock); @@ -1190,14 +1188,14 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s } -bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); memid.mem.os.base = start; memid.mem.os.size = size; memid.initially_committed = is_committed; memid.initially_zero = is_zero; - memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, numa_node, exclusive, memid, arena_id); + memid.is_pinned = is_pinned; + return mi_manage_os_memory_ex2(_mi_subproc(), start, size, numa_node, exclusive, memid, arena_id); } // Reserve a range of regular OS memory @@ -1207,13 +1205,12 @@ static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool com mi_memid_t memid; void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; - const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(subproc, start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(subproc, start, size, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; } - _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), memid.is_pinned ? 
" (in large os pages)" : ""); // mi_debug_show_arenas(true, true, false); return 0; @@ -1373,7 +1370,7 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m } _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - if (!mi_manage_os_memory_ex2(_mi_subproc(), p, hsize, true, numa_node, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(_mi_subproc(), p, hsize, numa_node, exclusive, memid, arena_id)) { _mi_os_free(p, hsize, memid); return ENOMEM; } From b515a0ad4c58f1e264213f22998c628470746bc1 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 16:28:34 -0800 Subject: [PATCH 143/264] add _mi_os_guard_page_size --- include/mimalloc/internal.h | 8 ++++++ include/mimalloc/types.h | 7 +++-- src/arena-meta.c | 26 +++++++----------- src/arena.c | 38 +++++++++++--------------- src/os.c | 54 ++++++++++++++++++++++++++++++++++++- 5 files changed, 91 insertions(+), 42 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 281f531a..7c49d590 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -116,6 +116,7 @@ void _mi_os_free(void* p, size_t size, mi_memid_t memid); void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); size_t _mi_os_page_size(void); +size_t _mi_os_guard_page_size(void); size_t _mi_os_good_alloc_size(size_t size); bool _mi_os_has_overcommit(void); bool _mi_os_has_virtual_reserve(void); @@ -129,6 +130,13 @@ bool _mi_os_unprotect(void* addr, size_t size); bool _mi_os_purge(void* p, size_t size); bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); +size_t _mi_os_secure_guard_page_size(void); +bool _mi_os_secure_guard_page_set_at(void* addr, bool is_pinned); +bool _mi_os_secure_guard_page_set_before(void* addr, bool is_pinned); +bool _mi_os_secure_guard_page_reset_at(void* addr); +bool _mi_os_secure_guard_page_reset_before(void* addr); + + void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 84179458..c2ce4a26 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -46,8 +46,12 @@ terms of the MIT license. A copy of the license can be found in the file // Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). // #define MI_STAT 1 -// Define MI_SECURE to enable security mitigations. The lowest two have minimal performance impact: +// Define MI_SECURE to enable security mitigations. Level 1 has minimal performance impact, +// but protects most metadata with guard pages: // #define MI_SECURE 1 // guard page around metadata +// +// Level 2 has more performance impact but protect well against various buffer overflows +// by surrounding all mimalloc pages with guard pages: // #define MI_SECURE 2 // guard page around each mimalloc page (can fragment VMA's with large heaps..) // // The next two levels can have more performance cost: @@ -126,7 +130,6 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) #define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bitmap) - // Maximum number of size classes. 
(spaced exponentially in 12.5% increments) #define MI_BIN_HUGE (73U) #define MI_BIN_FULL (MI_BIN_HUGE+1) diff --git a/src/arena-meta.c b/src/arena-meta.c index 34be6e0e..c8c0cac6 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -25,12 +25,6 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE #define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN -#if MI_SECURE -#define MI_META_PAGE_GUARD_SIZE (4*MI_KiB) -#else -#define MI_META_PAGE_GUARD_SIZE (0) -#endif - #define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit) #define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE #define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 @@ -47,7 +41,7 @@ static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_V #if MI_DEBUG > 1 static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) { - mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + MI_META_PAGE_GUARD_SIZE); + mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + _mi_os_secure_guard_page_size()); if (block_idx != NULL) { *block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE; } @@ -60,9 +54,9 @@ static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) { } static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { - mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - MI_META_PAGE_GUARD_SIZE, MI_META_PAGE_ALIGN)); + mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - _mi_os_secure_guard_page_size(), MI_META_PAGE_ALIGN)); mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE); - void* p = ((uint8_t*)mpage - MI_META_PAGE_GUARD_SIZE + (block_idx * MI_META_BLOCK_SIZE)); + void* p = ((uint8_t*)mpage - _mi_os_secure_guard_page_size() + (block_idx * MI_META_BLOCK_SIZE)); mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL)); return p; } @@ -82,20 +76,18 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { } // guard pages - #if MI_SECURE - if (!memid.is_pinned) { - _mi_os_decommit(base, MI_META_PAGE_GUARD_SIZE); - _mi_os_decommit(base + MI_META_PAGE_SIZE - MI_META_PAGE_GUARD_SIZE, MI_META_PAGE_GUARD_SIZE); - } + #if MI_SECURE >= 1 + _mi_os_secure_guard_page_set_at(base, memid.is_pinned); + _mi_os_secure_guard_page_set_before(base + MI_META_PAGE_SIZE, memid.is_pinned); #endif - + // initialize the page and free block bitmap - mi_meta_page_t* mpage = (mi_meta_page_t*)(base + MI_META_PAGE_GUARD_SIZE); + mi_meta_page_t* mpage = (mi_meta_page_t*)(base + _mi_os_secure_guard_page_size()); mpage->memid = memid; mi_bitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); - const size_t guard_blocks = _mi_divide_up(MI_META_PAGE_GUARD_SIZE, MI_META_BLOCK_SIZE); + const size_t guard_blocks = _mi_divide_up(_mi_os_secure_guard_page_size(), MI_META_BLOCK_SIZE); mi_assert_internal(info_blocks + 2*guard_blocks < MI_META_BLOCKS_PER_PAGE); mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks + guard_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks - 2*guard_blocks); diff --git a/src/arena.c b/src/arena.c index 7b97fbbc..3349abb1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -576,12 +576,6 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ return NULL; } 
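Aside: the `_mi_os_secure_guard_page_*` helpers introduced in this patch replace the per-file guard-size constants removed above (MI_META_PAGE_GUARD_SIZE) and just below (MI_ARENA_GUARD_PAGE_SIZE). A minimal POSIX-only sketch of the underlying idea follows; mimalloc decommits the guard page through its _mi_os_* layer, and mprotect() is used here only as an assumed stand-in with the same faulting behaviour.

/* Simplified sketch: a guard page is placed at the end of an area so an
   overflow into it faults, and is restored before the span is reused. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

static size_t guard_size(void) {
  return (size_t)sysconf(_SC_PAGESIZE);   // one OS page acts as the guard
}

// place a guard page just before `end` (the end of an allocated area);
// pinned memory (e.g. large OS pages) cannot be protected this way
static bool guard_set_before(uint8_t* end, bool is_pinned) {
  if (is_pinned) return false;
  return mprotect(end - guard_size(), guard_size(), PROT_NONE) == 0;
}

// remove the guard again so the whole area can later be reused as one span
static bool guard_reset_before(uint8_t* end) {
  return mprotect(end - guard_size(), guard_size(), PROT_READ | PROT_WRITE) == 0;
}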
-#if MI_SECURE < 2 -#define MI_ARENA_GUARD_PAGE_SIZE (0) -#else -#define MI_ARENA_GUARD_PAGE_SIZE (4*MI_KiB) -#endif - // Allocate a fresh page static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, mi_arena_t* req_arena, size_t tseq) @@ -621,11 +615,14 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); - // guard page at the end - const size_t page_noguard_size = mi_size_of_slices(slice_count) - MI_ARENA_GUARD_PAGE_SIZE; - #if MI_SECURE >= 2 - if (memid.initially_committed && !memid.is_pinned) { - _mi_os_decommit((uint8_t*)page + page_noguard_size, MI_ARENA_GUARD_PAGE_SIZE); + // guard page at the end of mimalloc page? + #if MI_SECURE < 2 + const size_t page_noguard_size = mi_size_of_slices(slice_count); + #else + mi_assert(mi_size_of_slices(slice_count) > _mi_os_secure_guard_page_size()); + const size_t page_noguard_size = mi_size_of_slices(slice_count) - _mi_os_secure_guard_page_size(); + if (memid.initially_committed) { + _mi_os_secure_guard_page_set_at((uint8_t*)page + page_noguard_size, memid.is_pinned); } #endif @@ -795,7 +792,7 @@ void _mi_arenas_page_free(mi_page_t* page) { // we must do this since we may later allocate large spans over this page and cannot have a guard page in between #if MI_SECURE >= 2 if (!page->memid.is_pinned) { - _mi_os_commit((uint8_t*)page + mi_memid_size(page->memid) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE, NULL); + _mi_os_secure_guard_page_reset_before((uint8_t*)page + mi_memid_size(page->memid)); } #endif @@ -1089,7 +1086,7 @@ static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_bas const size_t size = base_size + bitmaps_size; const size_t os_page_size = _mi_os_page_size(); - const size_t info_size = _mi_align_up(size, os_page_size) + MI_ARENA_GUARD_PAGE_SIZE; + const size_t info_size = _mi_align_up(size, os_page_size) + _mi_os_secure_guard_page_size(); const size_t info_slices = mi_slice_count_of_size(info_size); if (bitmap_base != NULL) *bitmap_base = base_size; @@ -1105,7 +1102,6 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); mi_assert(start!=NULL); if (start==NULL) return false; @@ -1134,17 +1130,15 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s // commit & zero if needed if (!memid.initially_committed) { - // if MI_SECURE, leave a guard OS page decommitted at the end - _mi_os_commit(arena, mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE, NULL); + // leave a guard OS page decommitted at the end + _mi_os_commit(arena, mi_size_of_slices(info_slices) - _mi_os_secure_guard_page_size(), NULL); } - else if (!memid.is_pinned) { - #if MI_SECURE > 0 - // if MI_SECURE, decommit a guard OS page at the end of the arena info - _mi_os_decommit((uint8_t*)arena + mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE); - #endif + else { + // if MI_SECURE, set a guard page at the end + _mi_os_secure_guard_page_set_before((uint8_t*)arena + 
mi_size_of_slices(info_slices), memid.is_pinned); } if (!memid.initially_zero) { - _mi_memzero(arena, mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE); + _mi_memzero(arena, mi_size_of_slices(info_slices) - _mi_os_secure_guard_page_size()); } // init diff --git a/src/os.c b/src/os.c index 80d44d12..399aac6c 100644 --- a/src/os.c +++ b/src/os.c @@ -61,8 +61,16 @@ size_t _mi_os_large_page_size(void) { return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size()); } +size_t _mi_os_guard_page_size(void) { + const size_t gsize = _mi_os_page_size(); + mi_assert(gsize <= (MI_ARENA_SLICE_SIZE/8)); + return gsize; +} + size_t _mi_os_virtual_address_bits(void) { - return mi_os_mem_config.virtual_address_bits; + const size_t vbits = mi_os_mem_config.virtual_address_bits; + mi_assert(vbits <= MI_MAX_VABITS); + return vbits; } bool _mi_os_use_large_page(size_t size, size_t alignment) { @@ -99,6 +107,50 @@ void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { return NULL; } +// In secure mode, return the size of a guard page, otherwise 0 +size_t _mi_os_secure_guard_page_size(void) { + #if MI_SECURE > 0 + return _mi_os_guard_page_size(); + #else + return 0; + #endif +} + +// In secure mode, try to decommit an area and output a warning if this fails. +bool _mi_os_secure_guard_page_set_at(void* addr, bool is_pinned) { + if (addr == NULL) return true; + #if MI_SECURE > 0 + const bool ok = (is_pinned ? false : _mi_os_decommit(addr, _mi_os_secure_guard_page_size())); + if (!ok) { + _mi_error_message(EINVAL, "secure level %d, but failed to commit guard page (at %p of size %zu)\n", MI_SECURE, addr, _mi_os_secure_guard_page_size()); + } + return ok; + #else + MI_UNUSED(is_pinned); + return true; + #endif +} + +// In secure mode, try to decommit an area and output a warning if this fails. 
+bool _mi_os_secure_guard_page_set_before(void* addr, bool is_pinned) { + return _mi_os_secure_guard_page_set_at((uint8_t*)addr - _mi_os_secure_guard_page_size(), is_pinned); +} + +// In secure mode, try to recommit an area +bool _mi_os_secure_guard_page_reset_at(void* addr) { + if (addr == NULL) return true; + #if MI_SECURE > 0 + return _mi_os_commit(addr, _mi_os_secure_guard_page_size(), NULL); + #else + return true; + #endif +} + +// In secure mode, try to recommit an area +bool _mi_os_secure_guard_page_reset_before(void* addr) { + return _mi_os_secure_guard_page_reset_at((uint8_t*)addr - _mi_os_secure_guard_page_size()); +} + /* ----------------------------------------------------------- Free memory From c65c6d83bd0a1c3d00bcbe8ce4fc1bc10ddc947e Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 16:31:42 -0800 Subject: [PATCH 144/264] fix guard page size --- ide/vs2022/mimalloc.vcxproj | 2 +- src/arena.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 87e866bb..63bc7d1d 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=0;MI_SECURE=4;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/src/arena.c b/src/arena.c index 3349abb1..9ae44d85 100644 --- a/src/arena.c +++ b/src/arena.c @@ -720,10 +720,10 @@ static mi_page_t* mi_arenas_page_singleton_alloc(mi_heap_t* heap, size_t block_s mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); - #if MI_ARENA_GUARD_PAGE_SIZE == 0 + #if MI_SECURE < 2 const size_t slice_count = mi_slice_count_of_size(info_size + block_size); #else - const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, MI_ARENA_GUARD_PAGE_SIZE) + MI_ARENA_GUARD_PAGE_SIZE); + const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, _mi_os_secure_guard_page_size()) + _mi_os_secure_guard_page_size()); #endif mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); From 9bad269c518a4104ac13584bc9474e0e357efd1c Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 16:47:01 -0800 Subject: [PATCH 145/264] fix purge delay check for arenas --- src/arena.c | 2 +- src/options.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index 9ae44d85..af0d1d0a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1551,7 +1551,7 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) mi_subproc_t* subproc = tld->subproc; const mi_msecs_t now = _mi_clock_now(); mi_msecs_t arenas_expire = mi_atomic_load_acquire(&subproc->purge_expire); - if (!force && (arenas_expire == 0 || arenas_expire < now)) return; + if (!force && (arenas_expire == 0 || arenas_expire > now)) return; const size_t max_arena = mi_arenas_get_count(subproc); if (max_arena == 0) return; diff --git a/src/options.c b/src/options.c index 7562cd46..63d8a68f 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 1000,UNINIT, 
MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 500, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From b77b34df968d610d7d26b0671f4375a072b39943 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 17:10:34 -0800 Subject: [PATCH 146/264] double arena per 4; large page objects 1/8 of large page size --- include/mimalloc/types.h | 2 +- src/arena.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 72c8d0a7..53c543d0 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -324,7 +324,7 @@ typedef struct mi_page_s { // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 11 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 128 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 1 MiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 1 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index b2113ec0..bc88acf3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -280,7 +280,7 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ if (arena_count >= 1 && arena_count <= 128) { // scale up the arena sizes exponentially every 4 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/2, 0, 16); + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; From 9a7c0d443a0e04f2610044ffb4bdfa752ada8864 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 17:15:13 -0800 Subject: [PATCH 147/264] max obj size 1/8 of a page --- include/mimalloc/types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index c2ce4a26..b21d0970 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -322,9 +322,9 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 8 KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 512 KiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) From ba68810333e74dbb0fd32becc92ef8cabc0f5c3b Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 18:33:37 -0800 Subject: [PATCH 148/264] commit page on demand --- ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc.h | 1 + include/mimalloc/types.h | 7 ++-- src/arena.c | 64 ++++++++++++++++++++++++++----------- src/init.c | 1 + src/options.c | 3 +- src/page.c | 16 ++++++++-- 7 files changed, 69 insertions(+), 25 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 63bc7d1d..87e866bb 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=0;MI_SECURE=4;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc.h b/include/mimalloc.h index 508e6aec..5f856411 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -400,6 +400,7 @@ typedef enum mi_option_e { mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) mi_option_max_vabits, // max user space virtual address bits to consider (=48) mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) + mi_option_page_commit_on_demand, // commit page memory on-demand _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index b21d0970..a4e158d6 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -139,6 +139,8 @@ terms of the MIT license. A copy of the license can be found in the file // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX +#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE + // ------------------------------------------------------ // Arena's are large reserved areas of memory allocated from // the OS that are managed by mimalloc to efficiently @@ -290,7 +292,7 @@ typedef struct mi_page_s { _Atomic(mi_page_flags_t) xflags; // `in_full_queue` and `has_aligned` flags size_t block_size; // size available in each block (always `>0`) - uint8_t* page_start; // start of the blocks + uint8_t* page_start; // start of the blocks mi_heaptag_t heap_tag; // tag of the owning heap, used to separate heaps by object type bool free_is_zero; // `true` if the blocks in the free list are zero initialized // padding @@ -301,6 +303,7 @@ typedef struct mi_page_s { mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + size_t page_committed; // committed size relative to `page_start`. 
mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -324,7 +327,7 @@ typedef struct mi_page_s { // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 512 KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 512 KiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index af0d1d0a..c31f1fe3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -562,7 +562,7 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); @@ -578,16 +578,16 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ // Allocate a fresh page static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, - mi_arena_t* req_arena, size_t tseq) + mi_arena_t* req_arena, size_t tseq, bool commit) { const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page - const bool commit = true; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t page_alignment = MI_ARENA_SLICE_ALIGN; // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; + const size_t alloc_size = mi_size_of_slices(slice_count); if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large @@ -604,10 +604,10 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice if (os_align) { // note: slice_count already includes the page mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(alloc_size, block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena, &memid); } else { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(alloc_size, page_alignment, 0 /* align offset */, commit, allow_large, req_arena, &memid); } } @@ -617,25 +617,25 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice // guard page at the end of mimalloc page? 
#if MI_SECURE < 2 - const size_t page_noguard_size = mi_size_of_slices(slice_count); + const size_t page_noguard_size = alloc_size; #else - mi_assert(mi_size_of_slices(slice_count) > _mi_os_secure_guard_page_size()); - const size_t page_noguard_size = mi_size_of_slices(slice_count) - _mi_os_secure_guard_page_size(); + mi_assert(alloc_size > _mi_os_secure_guard_page_size()); + const size_t page_noguard_size = alloc_size - _mi_os_secure_guard_page_size(); if (memid.initially_committed) { _mi_os_secure_guard_page_set_at((uint8_t*)page + page_noguard_size, memid.is_pinned); } #endif // claimed free slices: initialize the page partly - if (!memid.initially_zero) { + if (!memid.initially_zero && memid.initially_committed) { mi_track_mem_undefined(page, slice_count * MI_ARENA_SLICE_SIZE); _mi_memzero_aligned(page, sizeof(*page)); } - else { + else if (memid.initially_committed) { mi_track_mem_defined(page, slice_count * MI_ARENA_SLICE_SIZE); } #if MI_DEBUG > 1 - if (memid.initially_zero) { + if (memid.initially_zero && memid.initially_committed) { if (!mi_mem_is_zero(page, page_noguard_size)) { _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n"); memid.initially_zero = false; @@ -644,6 +644,7 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice } #endif mi_assert(MI_PAGE_INFO_SIZE >= mi_page_info_size()); + size_t block_start; #if MI_GUARDED // in a guarded build, we align pages with blocks a multiple of an OS page size, to the OS page size @@ -668,9 +669,24 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice } const size_t reserved = (os_align ? 1 : (page_noguard_size - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); + + // commit first block? + size_t commit_size = 0; + if (!memid.initially_committed) { + commit_size = _mi_align_up(block_start + block_size, MI_PAGE_MIN_COMMIT_SIZE); + if (commit_size > page_noguard_size) { commit_size = page_noguard_size; } + bool is_zero; + _mi_os_commit(page, commit_size, &is_zero); + if (!memid.initially_zero && !is_zero) { + _mi_memzero_aligned(page, commit_size); + } + } + + // initialize page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; + page->page_committed = (commit_size == 0 ? 0 : commit_size - block_start); mi_assert(commit_size == 0 || commit_size >= block_start + block_size); page->memid = memid; page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { @@ -704,7 +720,8 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou } // 2. 
find a free block, potentially allocating a new arena - page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq); + page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, + !mi_option_is_enabled(mi_option_page_commit_on_demand)); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -726,7 +743,7 @@ static mi_page_t* mi_arenas_page_singleton_alloc(mi_heap_t* heap, size_t block_s const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, _mi_os_secure_guard_page_size()) + _mi_os_secure_guard_page_size()); #endif - mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); + mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq, true /* commit singletons always */); if (page == NULL) return NULL; mi_assert(page->reserved == 1); @@ -779,7 +796,7 @@ void _mi_arenas_page_free(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may @@ -799,7 +816,16 @@ void _mi_arenas_page_free(mi_page_t* page) { // unregister page _mi_page_map_unregister(page); if (page->memid.memkind == MI_MEM_ARENA) { - mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); + mi_arena_t* arena = page->memid.mem.arena.arena; + mi_bitmap_clear(arena->pages, page->memid.mem.arena.slice_index); + if (page->page_committed > 0) { + // if committed on-demand, set the commit bits to account commit properly + const size_t total_committed = (page->page_start - (uint8_t*)page) + page->page_committed; + mi_assert_internal(mi_memid_size(page->memid) >= total_committed); + const size_t total_slices = _mi_divide_up(total_committed, MI_ARENA_SLICE_SIZE); + mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices); + mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); + } } _mi_arenas_free(page, mi_memid_size(page->memid), page->memid); } @@ -824,7 +850,7 @@ void _mi_arenas_page_abandon(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(!mi_page_is_singleton(page)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); @@ -889,7 +915,7 @@ void _mi_arenas_page_unabandon(mi_page_t* page) { mi_arena_t* arena = 
mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); @@ -1430,7 +1456,7 @@ static long mi_arena_purge_delay(void) { // returns if the memory is no longer committed (versus reset which keeps the commit) static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(!arena->memid.is_pinned); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); // we own it? const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); @@ -1455,7 +1481,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ const long delay = mi_arena_purge_delay(); if (arena->memid.is_pinned || delay < 0 || _mi_preloading()) return; // is purging allowed at all? - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); // we still own it? if (delay == 0) { // purge directly mi_arena_purge(arena, slice_index, slice_count); diff --git a/src/init.c b/src/init.c index 5240611c..16c1dea4 100644 --- a/src/init.c +++ b/src/init.c @@ -35,6 +35,7 @@ const mi_page_t _mi_page_empty = { #endif NULL, // xheap NULL, NULL, // next, prev + MI_ARENA_SLICE_SIZE, // page_committed MI_MEMID_STATIC // memid }; diff --git a/src/options.c b/src/options.c index 63d8a68f..faeb9da4 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 500, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 250, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -175,6 +175,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index 239d5d6e..ed94cae1 100644 --- a/src/page.c +++ b/src/page.c @@ -606,6 +606,18 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved); mi_assert_internal(extend < (1UL<<16)); + // commit on demand? 
+ if (page->page_committed > 0) { + const size_t needed_size = (page->capacity + extend)*bsize; + if (needed_size > page->page_committed) { + size_t commit_size = _mi_align_up(needed_size, MI_PAGE_MIN_COMMIT_SIZE); + const size_t max_size = page->reserved * bsize; + if (commit_size > max_size) { commit_size = max_size; } + mi_assert(commit_size > page->page_committed); + _mi_os_commit(mi_page_start(page) + page->page_committed, commit_size - page->page_committed, NULL); + } + } + // and append the extend the free list if (extend < MI_MIN_SLICES || MI_SECURE<3) { //!mi_option_is_enabled(mi_option_secure)) { mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats ); @@ -635,8 +647,8 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif #if MI_DEBUG>2 if (page->memid.initially_zero) { - mi_track_mem_defined(page->page_start, page_size); - mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); + mi_track_mem_defined(page->page_start, (page->page_committed == 0 ? page_size : page->page_committed)); + mi_assert_expensive(mi_mem_is_zero(page_start, (page->page_committed == 0 ? page_size : page->page_committed))); } #endif From d21114b5f2904aaefd8d97871e938e5ef839d942 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 11:37:52 -0800 Subject: [PATCH 149/264] improve page commit on demand --- include/mimalloc/internal.h | 25 +++++++++++-- include/mimalloc/types.h | 3 +- src/arena.c | 75 +++++++++++++++++++++++-------------- src/heap.c | 5 ++- src/options.c | 4 +- src/os.c | 20 ++++++---- src/page.c | 25 +++++++------ 7 files changed, 101 insertions(+), 56 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 7c49d590..5b877635 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -128,7 +128,8 @@ bool _mi_os_decommit(void* addr, size_t size); bool _mi_os_protect(void* addr, size_t size); bool _mi_os_unprotect(void* addr, size_t size); bool _mi_os_purge(void* p, size_t size); -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stats_size); +bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size); size_t _mi_os_secure_guard_page_size(void); bool _mi_os_secure_guard_page_set_at(void* addr, bool is_pinned); @@ -155,7 +156,7 @@ void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); bool _mi_arenas_contain(const void* p); -void _mi_arenas_collect(bool force_purge, mi_tld_t* tld); +void _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld); void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld); mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); @@ -534,9 +535,12 @@ static inline uint8_t* mi_page_start(const mi_page_t* page) { return page->page_start; } +static inline size_t mi_page_size(const mi_page_t* page) { + return mi_page_block_size(page) * page->reserved; +} static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { - if (size) { *size = mi_page_block_size(page) * page->reserved; } + if (size) { *size = mi_page_size(page); } return mi_page_start(page); } @@ -564,6 +568,21 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) { return 
mi_page_block_size(page) - MI_PADDING_SIZE; } +// This may change if we locate page info outside the page data slices +static inline uint8_t* mi_page_slice_start(const mi_page_t* page) { + return (uint8_t*)page; +} + +// This gives the offset relative to the start slice of a page. This may change if we ever +// locate page info outside the page-data itself. +static inline size_t mi_page_slice_offset_of(const mi_page_t* page, size_t offset_relative_to_page_start) { + return (page->page_start - mi_page_slice_start(page)) + offset_relative_to_page_start; +} + +static inline size_t mi_page_committed(const mi_page_t* page) { + return (page->slice_committed == 0 ? mi_page_size(page) : page->slice_committed - (page->page_start - mi_page_slice_start(page))); +} + static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { return page->heap; } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index a4e158d6..627aa6f9 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -139,6 +139,7 @@ terms of the MIT license. A copy of the license can be found in the file // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX +// Minimal commit for a page on-demand commit (should be >= OS page size, and >= MI_ARENA_SLICE_SIZE for correct stats) #define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE // ------------------------------------------------------ @@ -303,7 +304,7 @@ typedef struct mi_page_s { mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - size_t page_committed; // committed size relative to `page_start`. + size_t slice_committed; // committed size relative to the first arena slice of the page data mi_memid_t memid; // provenance of the page memory } mi_page_t; diff --git a/src/arena.c b/src/arena.c index c31f1fe3..a5b83bf5 100644 --- a/src/arena.c +++ b/src/arena.c @@ -207,12 +207,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( size_t already_committed_count = 0; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); // adjust the stats so we don't double count the commits - if (already_committed_count > 0) { - mi_subproc_stat_adjust_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count), true /* on alloc */); - } + //if (already_committed_count > 0) { + // mi_subproc_stat_adjust_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count), true /* on alloc */); + //} // now actually commit bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) { + if (!_mi_os_commit_ex(p, mi_size_of_slices(slice_count), &commit_zero, mi_size_of_slices(slice_count - already_committed_count))) { // failed to commit (todo: give warning?) if (already_committed_count > 0) { mi_subproc_stat_increase(arena->subproc, committed, mi_size_of_slices(already_committed_count)); @@ -686,7 +686,7 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; - page->page_committed = (commit_size == 0 ? 
0 : commit_size - block_start); mi_assert(commit_size == 0 || commit_size >= block_start + block_size); + page->slice_committed = commit_size; page->memid = memid; page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { @@ -720,8 +720,10 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou } // 2. find a free block, potentially allocating a new arena - page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, - !mi_option_is_enabled(mi_option_page_commit_on_demand)); + const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) || // always commit small pages + _mi_os_has_overcommit() || // no need to commit on demand on an OS that already does this for us + !mi_option_is_enabled(mi_option_page_commit_on_demand)); + page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, commit); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -818,13 +820,18 @@ void _mi_arenas_page_free(mi_page_t* page) { if (page->memid.memkind == MI_MEM_ARENA) { mi_arena_t* arena = page->memid.mem.arena.arena; mi_bitmap_clear(arena->pages, page->memid.mem.arena.slice_index); - if (page->page_committed > 0) { + if (page->slice_committed > 0) { // if committed on-demand, set the commit bits to account commit properly - const size_t total_committed = (page->page_start - (uint8_t*)page) + page->page_committed; - mi_assert_internal(mi_memid_size(page->memid) >= total_committed); - const size_t total_slices = _mi_divide_up(total_committed, MI_ARENA_SLICE_SIZE); + mi_assert_internal(mi_memid_size(page->memid) >= page->slice_committed); + const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE; // conservative + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices); - mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); + if (total_slices > 0) { + mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); + } + } + else { + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, page->memid.mem.arena.slice_index, page->memid.mem.arena.slice_count)); } } _mi_arenas_free(page, mi_memid_size(page->memid), page->memid); @@ -1005,8 +1012,8 @@ void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) { } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_tld_t* tld) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, tld); +void _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld) { + mi_arenas_try_purge(force_purge, visit_all, tld); } @@ -1062,7 +1069,7 @@ static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { // for dynamic libraries that are unloaded and need to release all their allocated memory. 
void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld) { mi_arenas_unsafe_destroy(_mi_subproc()); - _mi_arenas_collect(true /* force purge */, tld); // purge non-owned arenas + _mi_arenas_collect(true /* force purge */, true /* visit all*/, tld); // purge non-owned arenas } @@ -1462,15 +1469,23 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c void* const p = mi_arena_slice_start(arena, slice_index); //const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); size_t already_committed; - mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); // pretend all committed.. (as we lack a clearN call that counts the already set bits..) const bool all_committed = (already_committed == slice_count); - const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */); + const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */, mi_size_of_slices(already_committed)); - // update committed bitmap if (needs_recommit) { - mi_subproc_stat_adjust_decrease( arena->subproc, committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); + // no longer committed mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); + // we just counted in the purge to decommit all, but the some part was not committed so adjust that here + // mi_os_stat_decrease(committed, mi_size_of_slices(slice_count - already_committed)); } + else if (!all_committed) { + // we cannot assume any of these are committed any longer (even with reset since we did setN and may have marked uncommitted slices as committed) + mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); + // we adjust the commit count as parts will be re-committed + // mi_os_stat_decrease(committed, mi_size_of_slices(already_committed)); + } + return needs_recommit; } @@ -1493,6 +1508,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire0, expire)) { // expiration was not yet set // maybe set the global arenas expire as well (if it wasn't set already) + mi_assert_internal(expire0==0); mi_atomic_casi64_strong_acq_rel(&arena->subproc->purge_expire, &expire0, expire); } else { @@ -1554,8 +1570,8 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (!force && (expire == 0 || expire > now)) return false; - // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + // reset expire + mi_atomic_store_release(&arena->purge_expire, (mi_msecs_t)0); mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1); // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) @@ -1570,33 +1586,36 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) { + // try purge can be called often so try to only run when needed const long delay = mi_arena_purge_delay(); if (_mi_preloading() || delay <= 0) return; // nothing will be scheduled // check if any arena needs purging? 
mi_subproc_t* subproc = tld->subproc; const mi_msecs_t now = _mi_clock_now(); - mi_msecs_t arenas_expire = mi_atomic_load_acquire(&subproc->purge_expire); - if (!force && (arenas_expire == 0 || arenas_expire > now)) return; + const mi_msecs_t arenas_expire = mi_atomic_load_acquire(&subproc->purge_expire); + if (!visit_all && !force && (arenas_expire == 0 || arenas_expire > now)) return; const size_t max_arena = mi_arenas_get_count(subproc); if (max_arena == 0) return; - // allow only one thread to purge at a time + // allow only one thread to purge at a time (todo: allow concurrent purging?) static mi_atomic_guard_t purge_guard; mi_atomic_guard(&purge_guard) { // increase global expire: at most one purge per delay cycle - mi_atomic_store_release(&subproc->purge_expire, now + delay); + if (arenas_expire > now) { mi_atomic_store_release(&subproc->purge_expire, now + (delay/10)); } const size_t arena_start = tld->thread_seq % max_arena; - size_t max_purge_count = (visit_all ? max_arena : 2); + size_t max_purge_count = (visit_all ? max_arena : (max_arena/4)+1); bool all_visited = true; + bool any_purged = false; for (size_t _i = 0; _i < max_arena; _i++) { size_t i = _i + arena_start; if (i >= max_arena) { i -= max_arena; } mi_arena_t* arena = mi_arena_from_index(subproc,i); if (arena != NULL) { if (mi_arena_try_purge(arena, now, force)) { + any_purged = true; if (max_purge_count <= 1) { all_visited = false; break; @@ -1605,8 +1624,8 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) } } } - if (all_visited) { - mi_atomic_store_release(&subproc->purge_expire, (mi_msecs_t)0); + if (all_visited && !any_purged) { + mi_atomic_store_release(&subproc->purge_expire, 0); } } } diff --git a/src/heap.c b/src/heap.c index 6632861b..f0d495a3 100644 --- a/src/heap.c +++ b/src/heap.c @@ -119,8 +119,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - // collect arenas (this is program wide so don't force purges on abandonment of threads) - _mi_arenas_collect(collect == MI_FORCE /* force purge? */, heap->tld); + // collect arenas (this is program wide so don't force purges on abandonment of threads) + //mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1); + _mi_arenas_collect(collect == MI_FORCE /* force purge? */, true /* visit all? */, heap->tld); } void _mi_heap_collect_abandon(mi_heap_t* heap) { diff --git a/src/options.c b/src/options.c index faeb9da4..b613f983 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 250, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -175,7 +175,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, + { 1, UNINIT, MI_OPTION(page_commit_on_demand) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/os.c b/src/os.c index 399aac6c..79c2bc17 100644 --- a/src/os.c +++ b/src/os.c @@ -429,9 +429,9 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* return mi_os_page_align_areax(true, addr, size, newsize); } -bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { +bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size) { if (is_zero != NULL) { *is_zero = false; } - mi_os_stat_increase(committed, size); // use size for precise commit vs. decommit + mi_os_stat_increase(committed, stat_size); // use size for precise commit vs. decommit mi_os_stat_counter_increase(commit_calls, 1); // page align range @@ -458,9 +458,13 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { return true; } -static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) { +bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { + return _mi_os_commit_ex(addr, size, is_zero, size); +} + +static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, size_t stats_size) { mi_assert_internal(needs_recommit!=NULL); - mi_os_stat_decrease(committed, size); + mi_os_stat_decrease(committed, stats_size); // page align size_t csize; @@ -479,7 +483,7 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) { bool _mi_os_decommit(void* addr, size_t size) { bool needs_recommit; - return mi_os_decommit_ex(addr, size, &needs_recommit); + return mi_os_decommit_ex(addr, size, &needs_recommit, size); } @@ -509,7 +513,7 @@ bool _mi_os_reset(void* addr, size_t size) { // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stats_size) { if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? mi_os_stat_counter_increase(purge_calls, 1); @@ -519,7 +523,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) !_mi_preloading()) // don't decommit during preloading (unsafe) { bool needs_recommit = true; - mi_os_decommit_ex(p, size, &needs_recommit); + mi_os_decommit_ex(p, size, &needs_recommit, stats_size); return needs_recommit; } else { @@ -533,7 +537,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. 
bool _mi_os_purge(void* p, size_t size) { - return _mi_os_purge_ex(p, size, true); + return _mi_os_purge_ex(p, size, true, size); } diff --git a/src/page.c b/src/page.c index ed94cae1..aba548e9 100644 --- a/src/page.c +++ b/src/page.c @@ -251,8 +251,10 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { } else { mi_page_queue_remove(pq, page); + mi_tld_t* tld = page->heap->tld; mi_page_set_heap(page, NULL); - _mi_arenas_page_abandon(page); + _mi_arenas_page_abandon(page); + _mi_arenas_collect(false, false, tld); // allow purging } } @@ -263,7 +265,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_assert_internal(pq != NULL); mi_assert_internal(mi_heap_contains_queue(heap, pq)); mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); - #endif + #endif mi_page_t* page = _mi_arenas_page_alloc(heap, block_size, page_alignment); if (page == NULL) { // out-of-memory @@ -359,7 +361,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) { mi_heap_t* heap = page->heap; mi_page_set_heap(page,NULL); _mi_arenas_page_free(page); - _mi_arenas_collect(false, heap->tld); // allow purging + _mi_arenas_collect(false, false, heap->tld); // allow purging } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE @@ -607,14 +609,13 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(extend < (1UL<<16)); // commit on demand? - if (page->page_committed > 0) { + if (page->slice_committed > 0) { const size_t needed_size = (page->capacity + extend)*bsize; - if (needed_size > page->page_committed) { - size_t commit_size = _mi_align_up(needed_size, MI_PAGE_MIN_COMMIT_SIZE); - const size_t max_size = page->reserved * bsize; - if (commit_size > max_size) { commit_size = max_size; } - mi_assert(commit_size > page->page_committed); - _mi_os_commit(mi_page_start(page) + page->page_committed, commit_size - page->page_committed, NULL); + const size_t needed_commit = _mi_align_up( mi_page_slice_offset_of(page, needed_size), MI_PAGE_MIN_COMMIT_SIZE ); + if (needed_commit > page->slice_committed) { + mi_assert_internal(((needed_commit - page->slice_committed) % _mi_os_page_size()) == 0); + _mi_os_commit(mi_page_slice_start(page) + page->slice_committed, needed_commit - page->slice_committed, NULL); + page->slice_committed = needed_commit; } } @@ -647,8 +648,8 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif #if MI_DEBUG>2 if (page->memid.initially_zero) { - mi_track_mem_defined(page->page_start, (page->page_committed == 0 ? page_size : page->page_committed)); - mi_assert_expensive(mi_mem_is_zero(page_start, (page->page_committed == 0 ? 
page_size : page->page_committed))); + mi_track_mem_defined(page->page_start, mi_page_committed(page)); + mi_assert_expensive(mi_mem_is_zero(page_start, mi_page_committed(page))); } #endif From 71a1645d4d06fc5c7c1b91b7df6d94ff956c647e Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 24 Dec 2024 12:04:21 -0800 Subject: [PATCH 150/264] fix build --- src/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index 9915cdcf..48fa0315 100644 --- a/src/arena.c +++ b/src/arena.c @@ -563,7 +563,7 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); From 016b36d9173cc7adf51c4f3836bc1e22682e1837 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 24 Dec 2024 12:10:34 -0800 Subject: [PATCH 151/264] fix max va bits on unix --- src/os.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/os.c b/src/os.c index 79c2bc17..ef440fcd 100644 --- a/src/os.c +++ b/src/os.c @@ -15,14 +15,6 @@ terms of the MIT license. A copy of the license can be found in the file /* ----------------------------------------------------------- Initialization. ----------------------------------------------------------- */ -#ifndef MI_DEFAULT_VIRTUAL_ADDRESS_BITS -#if MI_INTPTR_SIZE < 8 -#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 32 -#else -#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 48 -#endif -#endif - #ifndef MI_DEFAULT_PHYSICAL_MEMORY #if MI_INTPTR_SIZE < 8 #define MI_DEFAULT_PHYSICAL_MEMORY 4*MI_GiB @@ -36,7 +28,7 @@ static mi_os_mem_config_t mi_os_mem_config = { 0, // large page size (usually 2MiB) 4096, // allocation granularity MI_DEFAULT_PHYSICAL_MEMORY, - MI_DEFAULT_VIRTUAL_ADDRESS_BITS, + MI_MAX_VABITS, // in `bits.h` true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) true // has virtual reserve? 
(if true we can reserve virtual address space without using commit or physical memory) From ad6f48f3e4b85d0f8a0f3de1a4ba2aeb9db8adb5 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 15:00:05 -0800 Subject: [PATCH 152/264] fix assertion for huge pages --- src/page-queue.c | 4 ++-- src/page.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/page-queue.c b/src/page-queue.c index 9e3aaacc..128ae8e3 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -156,7 +156,7 @@ static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { mi_heap_t* heap = mi_page_heap(page); mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); - mi_assert_expensive(mi_page_queue_contains(pq, page)); + mi_assert_expensive(mi_page_is_huge(page) || mi_page_queue_contains(pq, page)); return pq; } @@ -210,7 +210,7 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); - mi_assert_expensive(mi_page_queue_contains(queue, page)); + mi_assert_expensive(mi_page_is_huge(page) || mi_page_queue_contains(queue, page)); mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); diff --git a/src/page.c b/src/page.c index aba548e9..1e15644e 100644 --- a/src/page.c +++ b/src/page.c @@ -123,7 +123,7 @@ bool _mi_page_is_valid(mi_page_t* page) { //mi_assert_internal(!_mi_process_is_initialized); { mi_page_queue_t* pq = mi_page_queue_of(page); - mi_assert_internal(mi_page_queue_contains(pq, page)); + mi_assert_internal(mi_page_is_huge(page) || mi_page_queue_contains(pq, page)); mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page)); // mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); } @@ -298,7 +298,7 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); if (page==NULL) return NULL; mi_assert_internal(pq->block_size==mi_page_block_size(page)); - mi_assert_internal(pq==mi_heap_page_queue_of(heap, page)); + mi_assert_internal(mi_page_is_huge(page) || pq==mi_heap_page_queue_of(heap, page)); return page; } From d862e57955e7f00d16024b9780e43bb2e964eeae Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 16:39:54 -0800 Subject: [PATCH 153/264] fix huge page allocation size --- src/page-queue.c | 4 ++-- src/page.c | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/page-queue.c b/src/page-queue.c index 128ae8e3..9e3aaacc 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -156,7 +156,7 @@ static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { mi_heap_t* heap = mi_page_heap(page); mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); - mi_assert_expensive(mi_page_is_huge(page) || mi_page_queue_contains(pq, page)); + mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } @@ -210,7 +210,7 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); - mi_assert_expensive(mi_page_is_huge(page) || mi_page_queue_contains(queue, page)); + 
mi_assert_expensive(mi_page_queue_contains(queue, page)); mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); diff --git a/src/page.c b/src/page.c index 1e15644e..9a96da85 100644 --- a/src/page.c +++ b/src/page.c @@ -123,7 +123,7 @@ bool _mi_page_is_valid(mi_page_t* page) { //mi_assert_internal(!_mi_process_is_initialized); { mi_page_queue_t* pq = mi_page_queue_of(page); - mi_assert_internal(mi_page_is_huge(page) || mi_page_queue_contains(pq, page)); + mi_assert_internal(mi_page_queue_contains(pq, page)); mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page)); // mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); } @@ -298,7 +298,7 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); if (page==NULL) return NULL; mi_assert_internal(pq->block_size==mi_page_block_size(page)); - mi_assert_internal(mi_page_is_huge(page) || pq==mi_heap_page_queue_of(heap, page)); + mi_assert_internal(pq==mi_heap_page_queue_of(heap, page)); return page; } @@ -794,8 +794,9 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // Find a page with free blocks of `size`. -static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { - mi_page_queue_t* pq = mi_page_queue(heap, size); +static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) { + // mi_page_queue_t* pq = mi_page_queue(heap, size); + mi_assert_internal(!mi_page_queue_is_huge(pq)); // check the first page: we even do this with candidate search or otherwise we re-search every time mi_page_t* page = pq->first; @@ -853,13 +854,13 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex // Huge pages contain just one block, and the segment contains just that page. // Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) // so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. -static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { - size_t block_size = _mi_os_good_alloc_size(size); - mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); +static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment, mi_page_queue_t* pq) { + const size_t block_size = _mi_os_good_alloc_size(size); + // mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); #if MI_HUGE_PAGE_ABANDON - mi_page_queue_t* pq = NULL; + #error todo. #else - mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size + // mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size mi_assert_internal(mi_page_queue_is_huge(pq)); #endif mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); @@ -882,15 +883,17 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a // Allocate a page // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { + mi_page_queue_t* pq = mi_page_queue(heap, (huge_alignment ? MI_LARGE_MAX_OBJ_SIZE+1 : size)); // huge allocation? 
- const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - if mi_unlikely(req_size > (MI_LARGE_MAX_OBJ_SIZE - MI_PADDING_SIZE) || huge_alignment > 0) { + if mi_unlikely(mi_page_queue_is_huge(pq)) { + const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` + //if mi_unlikely(req_size > (MI_LARGE_MAX_OBJ_SIZE - MI_PADDING_SIZE) || huge_alignment > 0) { if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; } else { - return mi_huge_page_alloc(heap,size,huge_alignment); + return mi_huge_page_alloc(heap,size,huge_alignment,pq); } } else { @@ -898,7 +901,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme #if MI_PADDING mi_assert_internal(size >= MI_PADDING_SIZE); #endif - return mi_find_free_page(heap, size); + return mi_find_free_page(heap, pq); } } From 1e1a12bf3c4194ee121776aa3d383b218442c2a2 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 17:07:11 -0800 Subject: [PATCH 154/264] fix rounding issue with huge size allocations --- include/mimalloc/internal.h | 5 +++-- include/mimalloc/types.h | 2 +- src/page.c | 19 ++++++++----------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 5b877635..0e161951 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -706,9 +706,10 @@ static inline bool mi_page_is_huge(const mi_page_t* page) { (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page)); } - static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { - return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; + mi_page_queue_t* const pq = &((mi_heap_t*)heap)->pages[_mi_bin(size)]; + if (size <= MI_LARGE_MAX_OBJ_SIZE) { mi_assert_internal(pq->block_size <= MI_LARGE_MAX_OBJ_SIZE); } + return pq; } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 627aa6f9..4bede252 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -328,7 +328,7 @@ typedef struct mi_page_s { // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 512 KiB +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with _mi_bin #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/page.c b/src/page.c index 9a96da85..542496a0 100644 --- a/src/page.c +++ b/src/page.c @@ -883,18 +883,15 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a // Allocate a page // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { - mi_page_queue_t* pq = mi_page_queue(heap, (huge_alignment ? 
MI_LARGE_MAX_OBJ_SIZE+1 : size)); + const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` + if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { + _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); + return NULL; + } + mi_page_queue_t* pq = mi_page_queue(heap, (huge_alignment > 0 ? MI_LARGE_MAX_OBJ_SIZE+1 : size)); // huge allocation? - if mi_unlikely(mi_page_queue_is_huge(pq)) { - const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - //if mi_unlikely(req_size > (MI_LARGE_MAX_OBJ_SIZE - MI_PADDING_SIZE) || huge_alignment > 0) { - if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { - _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); - return NULL; - } - else { - return mi_huge_page_alloc(heap,size,huge_alignment,pq); - } + if mi_unlikely(mi_page_queue_is_huge(pq) || req_size > MI_MAX_ALLOC_SIZE) { + return mi_huge_page_alloc(heap,size,huge_alignment,pq); } else { // otherwise find a page with free blocks in our size segregated queues From 4d1d3471cff8e7285705fe590d46dcfe51e22d0c Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 17:14:53 -0800 Subject: [PATCH 155/264] rename page options --- include/mimalloc.h | 4 ++-- src/heap.c | 4 ++-- src/init.c | 4 ++-- src/options.c | 4 ++-- src/page.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 5f856411..6432e41a 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -396,8 +396,8 @@ typedef enum mi_option_e { mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) mi_option_target_segments_per_thread, // experimental (=0) mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) - mi_option_full_page_retain, // retain N full pages per size class (=2) - mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) + mi_option_page_full_retain, // retain N full pages per size class (=2) + mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4) mi_option_max_vabits, // max user space virtual address bits to consider (=48) mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) mi_option_page_commit_on_demand, // commit page memory on-demand diff --git a/src/heap.c b/src/heap.c index f0d495a3..09cc2574 100644 --- a/src/heap.c +++ b/src/heap.c @@ -170,8 +170,8 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->tld = tld; // avoid reading the thread-local tld during initialization heap->exclusive_arena = _mi_arena_from_id(arena_id); heap->allow_page_reclaim = !noreclaim; - heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); - heap->full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); + heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_page_full_retain) >= 0); + heap->full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. 
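For context, this rename patch changes only the option identifiers, not their behavior: mi_option_full_page_retain becomes mi_option_page_full_retain and mi_option_max_page_candidates becomes mi_option_page_max_candidates, with the defaults (2 and 4) unchanged; the hunks below update init.c, options.c and page.c to match. The following is a minimal usage sketch through the public option API, not part of the patch itself; the values are illustrative only, and it is assumed the matching environment variable names (e.g. MIMALLOC_PAGE_FULL_RETAIN) follow mimalloc's usual derivation from the option name.

  #include <mimalloc.h>

  int main(void) {
    // use the renamed option identifiers from this patch
    mi_option_set(mi_option_page_full_retain, 4);     // retain up to 4 full pages per size class (default 2)
    mi_option_set(mi_option_page_max_candidates, 8);  // consider up to 8 candidate pages per allocation (default 4)
    void* p = mi_malloc(64);
    mi_free(p);
    return 0;
  }
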
diff --git a/src/init.c b/src/init.c index 16c1dea4..4631d9d9 100644 --- a/src/init.c +++ b/src/init.c @@ -254,8 +254,8 @@ static void mi_heap_main_init(void) { //heap_main.keys[0] = _mi_heap_random_next(&heap_main); //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); - heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); - heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); + heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); + heap_main.full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } } diff --git a/src/options.c b/src/options.c index b613f983..0d9bea28 100644 --- a/src/options.c +++ b/src/options.c @@ -170,8 +170,8 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free - { 2, UNINIT, MI_OPTION(full_page_retain) }, - { 4, UNINIT, MI_OPTION(max_page_candidates) }, + { 2, UNINIT, MI_OPTION(page_full_retain) }, + { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? diff --git a/src/page.c b/src/page.c index 542496a0..474d8d2d 100644 --- a/src/page.c +++ b/src/page.c @@ -721,7 +721,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages) if (page_candidate == NULL) { page_candidate = page; - candidate_limit = _mi_option_get_fast(mi_option_max_page_candidates); + candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates); } else if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); From 8259c0eb7ca96787a50bcbad24d28b5bb2407acd Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 20:10:44 -0800 Subject: [PATCH 156/264] nice colors for heap maps --- include/mimalloc.h | 2 +- src/arena.c | 119 ++++++++++++++++++++++++++++++++------------- src/libc.c | 18 ++++++- src/options.c | 4 +- test/test-stress.c | 14 +++--- 5 files changed, 111 insertions(+), 46 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 6432e41a..dacc647e 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef void* mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index a5b83bf5..083fc35b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -720,9 +720,9 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou } // 2. 
find a free block, potentially allocating a new arena + const long commit_on_demand = mi_option_get(mi_option_page_commit_on_demand); const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) || // always commit small pages - _mi_os_has_overcommit() || // no need to commit on demand on an OS that already does this for us - !mi_option_is_enabled(mi_option_page_commit_on_demand)); + (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 1)); page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, commit); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); @@ -824,7 +824,7 @@ void _mi_arenas_page_free(mi_page_t* page) { // if committed on-demand, set the commit bits to account commit properly mi_assert_internal(mi_memid_size(page->memid) >= page->slice_committed); const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE; // conservative - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); + //mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices); if (total_slices > 0) { mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); @@ -1262,56 +1262,106 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noe /* ----------------------------------------------------------- Debugging ----------------------------------------------------------- */ -static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { +static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf, size_t* k) { size_t bit_set_count = 0; for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); if (is_set) bit_set_count++; - buf[bit] = (is_set ? 'x' : '.'); + buf[*k++] = (is_set ? 
'x' : '.'); } return bit_set_count; } -static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t* arena, size_t slice_index) { +typedef enum mi_ansi_color_e { + MI_BLACK = 30, + MI_MAROON, + MI_DARKGREEN, + MI_ORANGE, + MI_NAVY, + MI_PURPLE, + MI_TEAL, + MI_GRAY, + MI_DARKGRAY = 90, + MI_RED, + MI_GREEN, + MI_YELLOW, + MI_BLUE, + MI_MAGENTA, + MI_CYAN, + MI_WHITE +} mi_ansi_color_t; + +static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) { + buf[*k] = '\x1b'; + buf[*k+1] = '['; + buf[*k+2] = (char)(((int)color / 10) + '0'); + buf[*k+3] = (char)(((int)color % 10) + '0'); + buf[*k+4] = 'm'; + *k += 5; +} + +static int mi_page_commit_usage(mi_page_t* page) { + if (mi_page_size(page) <= MI_PAGE_MIN_COMMIT_SIZE) return 100; + const size_t committed_size = mi_page_committed(page); + const size_t used_size = page->used * mi_page_block_size(page); + return (int)(used_size * 100 / committed_size); +} + +static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, mi_arena_t* arena, size_t slice_index) { size_t bit_set_count = 0; long bit_of_page = 0; + mi_ansi_color_t color = MI_GRAY; + mi_ansi_color_t prev_color = MI_GRAY; for (int bit = 0; bit < MI_BFIELD_BITS; bit++, bit_of_page--) { bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); void* start = mi_arena_slice_start(arena, slice_index + bit); + char c = ' '; if (is_set) { mi_assert_internal(bit_of_page <= 0); bit_set_count++; mi_page_t* page = (mi_page_t*)start; - char c = 'p'; + c = 'p'; + color = MI_GRAY; if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } else if (mi_page_is_abandoned(page)) { c = (mi_page_is_singleton(page) ? 's' : 'f'); } + int commit_usage = mi_page_commit_usage(page); + if (commit_usage < 25) { color = MI_MAROON; } + else if (commit_usage < 50) { color = MI_ORANGE; } + else if (commit_usage < 75) { color = MI_TEAL; } + else color = MI_DARKGREEN; bit_of_page = (long)page->memid.mem.arena.slice_count; - buf[bit] = c; } else { - char c = '?'; + c = '?'; if (bit_of_page > 0) { c = '-'; } - else if (_mi_meta_is_meta_page(start)) { c = 'm'; } - else if (slice_index + bit < arena->info_slices) { c = 'i'; } + else if (_mi_meta_is_meta_page(start)) { c = 'm'; color = MI_GRAY; } + else if (slice_index + bit < arena->info_slices) { c = 'i'; color = MI_GRAY; } // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } else if (mi_bitmap_is_set(arena->slices_free, slice_index+bit)) { - if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; } - else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } - else { c = '.'; } + if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; color = MI_ORANGE; } + else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; color = MI_GRAY; } + else { c = '.'; color = MI_GRAY; } } - if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; } - buf[bit] = c; + if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; } } + if (color != prev_color) { + mi_debug_color(buf, k, color); + prev_color = color; + } + buf[*k] = c; *k += 1; } + mi_debug_color(buf, k, MI_GRAY); return bit_set_count; } +#define MI_FIELDS_PER_LINE (4) + static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { - _mi_output_message("%s:\n", header); + _mi_output_message("\x1B[37m%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header); 
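  // Reference for the escape codes above (not part of this hunk): mi_debug_color and the
  // header string emit plain ANSI SGR sequences, ESC '[' <n> 'm', where <n> is one of the
  // mi_ansi_color_t values (30..37 normal, 90..97 bright foreground) and 0 resets, so the
  // colored heap map only renders as intended on an ANSI-capable terminal.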
size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { - char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); + char buf[10*MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); size_t k = 0; mi_bchunk_t* chunk = &bitmap->chunks[i]; @@ -1320,17 +1370,18 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { - if (j > 0 && (j % 4) == 0) { - buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5; + if (j > 0 && (j % MI_FIELDS_PER_LINE) == 0) { + _mi_output_message(" %s\n\x1B[37m", buf); + _mi_memzero(buf, sizeof(buf)); + k = 0; buf[k++] = ' '; buf[k++] = ' '; buf[k++] = ' '; } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; - size_t xcount = (arena!=NULL ? mi_debug_show_page_bfield(bfield, buf + k, arena, bit_count) - : mi_debug_show_bfield(bfield, buf + k)); + size_t xcount = (arena!=NULL ? mi_debug_show_page_bfield(bfield, buf, &k, arena, bit_count) + : mi_debug_show_bfield(bfield, buf, &k)); if (invert) xcount = MI_BFIELD_BITS - xcount; bit_set_count += xcount; - k += MI_BFIELD_BITS; buf[k++] = ' '; } else { @@ -1339,16 +1390,16 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi } bit_count += MI_BFIELD_BITS; } - _mi_output_message(" %s\n", buf); + _mi_output_message(" %s\n\x1B[37m", buf); } - _mi_output_message(" total ('x'): %zu\n", bit_set_count); + _mi_output_message("\x1B[0m total ('x'): %zu\n", bit_set_count); return bit_set_count; } -void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { +void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { mi_subproc_t* subproc = _mi_subproc(); size_t max_arenas = mi_arenas_get_count(subproc); - size_t free_total = 0; + //size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; size_t page_total = 0; @@ -1358,12 +1409,12 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_assert(arena->subproc == subproc); slice_total += arena->slice_count; _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? 
", pinned" : ""), arena->subproc); - if (show_inuse) { - free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); - } - if (show_committed) { - mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); - } + //if (show_inuse) { + // free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); + //} + //if (show_committed) { + // mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); + //} // todo: abandoned slices //if (show_purge) { // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); @@ -1372,7 +1423,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); } } - if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); + // if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); if (show_pages) _mi_output_message("total pages in arenas: %zu\n", page_total); } diff --git a/src/libc.c b/src/libc.c index 0ec2164d..a0eeca17 100644 --- a/src/libc.c +++ b/src/libc.c @@ -171,7 +171,18 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { char c; MI_NEXTC(); if (c != '%') { - if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t') { // output visible ascii or standard control only + if (c == '\\') { + MI_NEXTC(); + switch (c) { + case 'e': mi_outc('\x1B', &out, end); break; + case 't': mi_outc('\t', &out, end); break; + case 'n': mi_outc('\n', &out, end); break; + case 'r': mi_outc('\r', &out, end); break; + case '\\': mi_outc('\\', &out, end); break; + default: /* ignore */ break; + } + } + else if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t' || c=='\x1b') { // output visible ascii or standard control only mi_outc(c, &out, end); } } @@ -199,7 +210,10 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { } char* start = out; - if (c == 's') { + if (c == '%') { + mi_outc('%', &out, end); + } + else if (c == 's') { // string const char* s = va_arg(args, const char*); mi_outs(s, &out, end); diff --git a/src/options.c b/src/options.c index 0d9bea28..0d51cc00 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 1000,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -175,7 +175,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 1, UNINIT, MI_OPTION(page_commit_on_demand) }, + { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/test/test-stress.c b/test/test-stress.c index bbcded65..527d6dce 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -260,9 +260,9 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); - mi_debug_show_arenas(true, false, false); + mi_debug_show_arenas(true); //mi_collect(true); - //mi_debug_show_arenas(true, false, false); + //mi_debug_show_arenas(true); } #endif } @@ -346,13 +346,13 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - //mi_debug_show_arenas(true, true, false); - mi_debug_show_arenas(true, false, false); - mi_collect(true); - mi_debug_show_arenas(true,false,false); + //mi_debug_show_arenas(true); + mi_debug_show_arenas(true); + //mi_collect(true); + //mi_debug_show_arenas(true); #else //mi_collect(true); - mi_debug_show_arenas(true,false,false); + mi_debug_show_arenas(true); mi_stats_print(NULL); #endif #else From 24b8384f80b62d4382b504c80c9b62a5fa9b91cf Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 20:23:37 -0800 Subject: [PATCH 157/264] remove is_expandable requirement on page candidates --- src/options.c | 2 +- src/page.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/options.c b/src/options.c index 0d51cc00..0a9a5f92 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 1000,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose diff --git a/src/page.c b/src/page.c index 474d8d2d..2f0ec406 100644 --- a/src/page.c +++ b/src/page.c @@ -728,7 +728,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page_candidate = page; } // prefer to reuse fuller pages (in the hope the less used page gets freed) - else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page) && !mi_page_is_expandable(page)) { + else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { // && !mi_page_is_expandable(page)) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate From 5a663da9aaca48e90ced03d832f387ad42e976bc Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 20:38:36 -0800 Subject: [PATCH 158/264] fix build warning --- src/arena.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index 083fc35b..bbc0907e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1400,14 +1400,14 @@ void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { mi_subproc_t* subproc = _mi_subproc(); size_t max_arenas = mi_arenas_get_count(subproc); //size_t free_total = 0; - size_t slice_total = 0; + //size_t slice_total = 0; //size_t abandoned_total = 0; size_t page_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena == NULL) break; mi_assert(arena->subproc == subproc); - slice_total += arena->slice_count; + // slice_total += arena->slice_count; _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : ""), arena->subproc); //if (show_inuse) { // free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); From ce7eb4db7a746aba77a35fa332d9b01f23430b9b Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 10:49:49 -0800 Subject: [PATCH 159/264] fix page commit-on-demand setting --- src/arena.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/arena.c b/src/arena.c index bbc0907e..bd1c3e70 100644 --- a/src/arena.c +++ b/src/arena.c @@ -213,10 +213,6 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // now actually commit bool commit_zero = false; if (!_mi_os_commit_ex(p, mi_size_of_slices(slice_count), &commit_zero, mi_size_of_slices(slice_count - already_committed_count))) { - // failed to commit (todo: give warning?) - if (already_committed_count > 0) { - mi_subproc_stat_increase(arena->subproc, committed, mi_size_of_slices(already_committed_count)); - } memid->initially_committed = false; } else { @@ -308,7 +304,9 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice // is actually allocated for the first time it will be counted. 
const bool adjust = (overcommit && arena_commit); - if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true /* on alloc */); } + if (adjust) { + mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true /* on alloc */); + } // and try to reserve the arena int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { @@ -562,7 +560,7 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); @@ -722,7 +720,7 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou // 2. find a free block, potentially allocating a new arena const long commit_on_demand = mi_option_get(mi_option_page_commit_on_demand); const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) || // always commit small pages - (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 1)); + (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 0)); page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, commit); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); @@ -798,7 +796,7 @@ void _mi_arenas_page_free(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may @@ -857,7 +855,7 @@ void _mi_arenas_page_abandon(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(!mi_page_is_singleton(page)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); @@ -922,7 +920,7 @@ void _mi_arenas_page_unabandon(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, 
slice_count)); - mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); @@ -1161,9 +1159,9 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_arena_t* arena = (mi_arena_t*)start; - // commit & zero if needed + // commit & zero if needed if (!memid.initially_committed) { - // leave a guard OS page decommitted at the end + // leave a guard OS page decommitted at the end _mi_os_commit(arena, mi_size_of_slices(info_slices) - _mi_os_secure_guard_page_size(), NULL); } else { @@ -1180,7 +1178,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s arena->is_exclusive = exclusive; arena->slice_count = slice_count; arena->info_slices = info_slices; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->purge_expire = 0; // mi_lock_init(&arena->abandoned_visit_lock); @@ -1292,7 +1290,7 @@ typedef enum mi_ansi_color_e { } mi_ansi_color_t; static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) { - buf[*k] = '\x1b'; + buf[*k] = '\x1b'; buf[*k+1] = '['; buf[*k+2] = (char)(((int)color / 10) + '0'); buf[*k+3] = (char)(((int)color % 10) + '0'); @@ -1342,7 +1340,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; color = MI_GRAY; } else { c = '.'; color = MI_GRAY; } } - if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; } + if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; } } if (color != prev_color) { mi_debug_color(buf, k, color); @@ -1357,7 +1355,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, #define MI_FIELDS_PER_LINE (4) static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { - _mi_output_message("\x1B[37m%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header); + _mi_output_message("\x1B[37m%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header); size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { @@ -1506,7 +1504,7 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv static long mi_arena_purge_delay(void) { // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); } // reset or decommit in an arena and update the commit bitmap @@ -1533,7 +1531,7 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c else if (!all_committed) { // we cannot assume any of these are committed any longer (even with reset since we did setN and may have marked uncommitted slices as committed) mi_bitmap_clearN(arena->slices_committed, slice_index, 
slice_count); - // we adjust the commit count as parts will be re-committed + // we adjust the commit count as parts will be re-committed // mi_os_stat_decrease(committed, mi_size_of_slices(already_committed)); } @@ -1621,7 +1619,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (!force && (expire == 0 || expire > now)) return false; - // reset expire + // reset expire mi_atomic_store_release(&arena->purge_expire, (mi_msecs_t)0); mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1); @@ -1696,8 +1694,8 @@ static bool abandoned_page_visit(mi_page_t* page, mi_abandoned_page_visit_info_t if (page->heap_tag != vinfo->heap_tag) { return true; } // continue mi_heap_area_t area; _mi_heap_area_init(&area, page); - if (!vinfo->visitor(NULL, &area, NULL, area.block_size, vinfo->arg)) { - return false; + if (!vinfo->visitor(NULL, &area, NULL, area.block_size, vinfo->arg)) { + return false; } if (vinfo->visit_blocks) { return _mi_heap_area_visit_blocks(&area, page, vinfo->visitor, vinfo->arg); @@ -1712,7 +1710,7 @@ static bool abandoned_page_visit_at(size_t slice_index, size_t slice_count, mi_a mi_abandoned_page_visit_info_t* vinfo = (mi_abandoned_page_visit_info_t*)arg; mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); mi_assert_internal(mi_page_is_abandoned_mapped(page)); - return abandoned_page_visit(page, vinfo); + return abandoned_page_visit(page, vinfo); } // Visit all abandoned pages in this subproc. From 15061be4b2fec43ed8bfaa807fd98624366d04e6 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 10:50:49 -0800 Subject: [PATCH 160/264] commit page-map within one allocation --- src/page-map.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/page-map.c b/src/page-map.c index db14265b..a917175a 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -160,6 +160,7 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att #else // A 2-level page map +#define MI_PAGE_MAP_SUB_SIZE (MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*)) mi_decl_cache_align mi_page_t*** _mi_page_map; static void* mi_page_map_max_address; @@ -167,6 +168,7 @@ static mi_memid_t mi_page_map_memid; static _Atomic(mi_bfield_t) mi_page_map_commit; +static mi_page_t** mi_page_map_ensure_committed(size_t idx); static mi_page_t** mi_page_map_ensure_at(size_t idx); static inline void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count); @@ -200,16 +202,17 @@ bool _mi_page_map_init(void) { } mi_atomic_store_release(&mi_page_map_commit, (commit ? 
~MI_ZU(0) : MI_ZU(0))); - // commit the first part so NULL pointers get resolved without an access violation - mi_page_map_ensure_at(0); - - // note: for the NULL range we only commit one OS page - // mi_page_map_set_range(NULL, 0, 0, 1); - _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); + // note: for the NULL range we only commit one OS page (in the map and sub) if (!mi_page_map_memid.initially_committed) { - _mi_os_commit(_mi_page_map[0], os_page_size, NULL); + _mi_os_commit(&_mi_page_map[0], os_page_size, NULL); // commit first part of the map + } + _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved 2 subs at the end already + if (!mi_page_map_memid.initially_committed) { + _mi_os_commit(_mi_page_map[0], os_page_size, NULL); // only first OS page + } + if (!mi_page_map_memid.initially_zero) { + _mi_page_map[0][0] = NULL; } - _mi_page_map[0][0] = NULL; mi_assert_internal(_mi_ptr_page(NULL)==NULL); return true; From 7ae726bb390fe40aed6da0791d3934f59712beb9 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 13:30:42 -0800 Subject: [PATCH 161/264] small fixes --- include/mimalloc/types.h | 4 ++-- src/arena.c | 9 +++++++-- src/options.c | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 06db5639..8b72140a 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -139,8 +139,8 @@ terms of the MIT license. A copy of the license can be found in the file // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX -// Minimal commit for a page on-demand commit (should be >= OS page size, and >= MI_ARENA_SLICE_SIZE for correct stats) -#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE +// Minimal commit for a page on-demand commit (should be >= OS page size) +#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE // (4*MI_KiB) // ------------------------------------------------------ // Arena's are large reserved areas of memory allocated from diff --git a/src/arena.c b/src/arena.c index 5cdf0d22..c8d4c9cd 100644 --- a/src/arena.c +++ b/src/arena.c @@ -832,10 +832,15 @@ void _mi_arenas_page_free(mi_page_t* page) { const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE; // conservative //mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices); - mi_assert_internal(total_slices > 0); if (total_slices > 0) { mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); } + // any left over? 
+ const size_t extra = page->slice_committed % MI_ARENA_SLICE_SIZE; + if (extra > 0) { + // pretend it was decommitted already + mi_os_stat_decrease(committed, extra); + } } else { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, page->memid.mem.arena.slice_index, page->memid.mem.arena.slice_count)); @@ -1308,7 +1313,7 @@ static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) { } static int mi_page_commit_usage(mi_page_t* page) { - if (mi_page_size(page) <= MI_PAGE_MIN_COMMIT_SIZE) return 100; + // if (mi_page_size(page) <= MI_PAGE_MIN_COMMIT_SIZE) return 100; const size_t committed_size = mi_page_committed(page); const size_t used_size = page->used * mi_page_block_size(page); return (int)(used_size * 100 / committed_size); diff --git a/src/options.c b/src/options.c index 0a9a5f92..13174798 100644 --- a/src/options.c +++ b/src/options.c @@ -430,7 +430,7 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me // Define our own limited `fprintf` that avoids memory allocation. // We do this using `_mi_vsnprintf` with a limited buffer. static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { - char buf[768]; + char buf[992]; if (fmt==NULL) return; if (!mi_recurse_enter()) return; _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args); From 5f13941c1859c3a08d5a5b321c7f99481ca66dae Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 14:12:45 -0800 Subject: [PATCH 162/264] fix constructor re-initialization on subproc_main --- src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index 4631d9d9..4feee790 100644 --- a/src/init.c +++ b/src/init.c @@ -97,7 +97,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -static mi_decl_cache_align mi_subproc_t subproc_main; +static mi_decl_cache_align mi_subproc_t subproc_main = { }; // note: empty initializer to prevent running the constructor (in C++ compilation) static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id From efe10513ec056d9e81f713e6c441376dcd2bbf43 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Wed, 25 Dec 2024 14:40:32 -0800 Subject: [PATCH 163/264] fix initializer warning on clang-18 --- src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index 4feee790..81aca206 100644 --- a/src/init.c +++ b/src/init.c @@ -97,7 +97,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -static mi_decl_cache_align mi_subproc_t subproc_main = { }; // note: empty initializer to prevent running the constructor (in C++ compilation) +static mi_decl_cache_align mi_subproc_t subproc_main = { 0 }; // note: empty initializer to prevent running the constructor (in C++ compilation) static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id From 27e0c467aefb5b7591cb291e4456823966f58344 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 14:56:11 -0800 Subject: [PATCH 164/264] fix c++ initializer warning --- src/init.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index 81aca206..cc96e993 100644 --- a/src/init.c +++ b/src/init.c @@ -97,7 +97,12 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -static 
mi_decl_cache_align mi_subproc_t subproc_main = { 0 }; // note: empty initializer to prevent running the constructor (in C++ compilation) +static mi_decl_cache_align mi_subproc_t subproc_main +#if __cplusplus += { }; // empty initializer to prevent running the constructor (with msvc) +#else += { 0 }; // C zero initialize +#endif static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id From f72ac7a5aa85eb95f5e29a410c43a52543cfd444 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 26 Dec 2024 10:28:36 -0800 Subject: [PATCH 165/264] add attr_noexept for better codegen on msvc --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 ++-- src/free.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index cb761f94..9568b2d3 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea6} + + {abb5eae7-b3e6-432e-b636-333449892ea7} diff --git a/src/free.c b/src/free.c index 4d72cc7a..7467adc1 100644 --- a/src/free.c +++ b/src/free.c @@ -48,10 +48,10 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } // Forward declaration for multi-threaded collect -static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page); +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept; // Free a block multi-threaded -static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) +static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_noexcept { // adjust stats (after padding check and potentially recursive `mi_free` above) mi_stat_free(page, block); // stat_free may access the padding @@ -195,7 +195,7 @@ void mi_free(void* p) mi_attr_noexcept // ------------------------------------------------------ -static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); From 0a7fd7eb6fa030c77ac98d6327c323b4409608f2 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 26 Dec 2024 10:42:24 -0800 Subject: [PATCH 166/264] use fixed tls on windows with static linking --- ide/vs2022/mimalloc-lib.vcxproj | 2 +- ide/vs2022/mimalloc-test-stress.vcxproj | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index c82dbec7..a0c8101b 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -299,7 +299,7 @@ true Default ../../include - %(PreprocessorDefinitions);NDEBUG + %(PreprocessorDefinitions);NDEBUG;MI_WIN_USE_FIXED_TLS=1 AssemblyAndSourceCode $(IntDir) false diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index 9568b2d3..cb761f94 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea7} + + {abb5eae7-b3e6-432e-b636-333449892ea6} From e359e9b12ba39c885e122acd6177bcf5b2cb77ed Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 26 Dec 2024 10:43:10 -0800 Subject: [PATCH 167/264] merge from dev3 --- ide/vs2022/mimalloc-lib.vcxproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index a0c8101b..c82dbec7 100644 --- 
a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -299,7 +299,7 @@ true Default ../../include - %(PreprocessorDefinitions);NDEBUG;MI_WIN_USE_FIXED_TLS=1 + %(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false From 8a4c26377f128dd3010f94076c4ab819f1076c8b Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 26 Dec 2024 23:12:03 -0800 Subject: [PATCH 168/264] add neon code for bit clear --- include/mimalloc/bits.h | 2 ++ src/bitmap.c | 23 ++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 875f6230..5b847f4b 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -92,6 +92,8 @@ typedef int32_t mi_ssize_t; #if MI_ARCH_X64 && defined(__AVX2__) #include +#elif MI_ARCH_ARM64 && MI_OPT_SIMD +#include #endif #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) #include diff --git a/src/bitmap.c b/src/bitmap.c index e4a4cc2d..15ae66a0 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -573,6 +573,27 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } + #elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64 + while(true) { + // a cache line is 64b so we can just as well load all at the same time (?) + const uint64x2_t vzero1_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields)); // 2x64 bit is_zero + const uint64x2_t vzero1_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 2)); // 2x64 bit is_zero + const uint64x2_t vzero2_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 4)); // 2x64 bit is_zero + const uint64x2_t vzero2_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 6)); // 2x64 bit is_zero + const uint32x4_t vzero1 = vuzp1q_u32(vreinterpretq_u32_u64(vzero1_lo),vreinterpretq_u32_u64(vzero1_hi)); // unzip even elements: narrow to 4x32 bit is_zero () + const uint32x4_t vzero2 = vuzp1q_u32(vreinterpretq_u32_u64(vzero2_lo),vreinterpretq_u32_u64(vzero2_hi)); // unzip even elements: narrow to 4x32 bit is_zero () + const uint32x4_t vzero1x = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero1), 24)); // shift-right 2x32bit elem by 24: lo 16 bits contain the 2 lo bytes + const uint32x4_t vzero2x = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero2), 24)); + const uint16x8_t vzero12 = vreinterpretq_u16_u32(vuzp1q_u32(vzero1x,vzero2x)); // unzip even 32-bit elements into one vector + const uint8x8_t vzero = vmovn_u32(vzero12); // narrow the bottom 16-bits + const uint64_t mask = ~vget_lane_u64(vreinterpret_u64_u8(vzero), 0); // 1 byte for each bfield (0xFF => bfield has a bit set) + if (mask==0) return false; + mi_assert_internal((mi_ctz(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
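// An illustrative scalar sketch (not code from this patch) of what the NEON
// reduction above computes, assuming MI_BCHUNK_BITS==512 so a chunk holds eight
// 64-bit bfields, and assuming <stdint.h>: `mask` gets one byte per bfield,
// 0xFF when that bfield has any bit set, hence mi_ctz(mask)/8 below is the index
// of the first candidate bfield. The function name is hypothetical.
static inline uint64_t mi_chunk_nonzero_bytemask_sketch(const uint64_t bfields[8]) {
  uint64_t mask = 0;
  for (int i = 0; i < 8; i++) {
    if (bfields[i] != 0) { mask |= ((uint64_t)0xFF << (8*i)); }   // mark non-empty bfield i
  }
  return mask;  // 0 means every bfield in the chunk is clear
}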
+ const size_t chunk_idx = mi_ctz(mask) / 8; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded + } #else // try first to find a field that is not all set (to reduce fragmentation) for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -590,7 +611,7 @@ static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, return mi_bchunk_try_find_and_clear(chunk, pidx); } -#if !MI_OPT_SIMD +#if !(MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)) static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); if (!allow_all_set && (~b == 0)) return false; From dddcd5de16f0eb61e9ecd6f0a13e0695ddcad257 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 26 Dec 2024 23:49:38 -0800 Subject: [PATCH 169/264] add neon version for chunk_is_clear --- src/bitmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 15ae66a0..03e21c89 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -868,6 +868,13 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2))); + #elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64 + const uint64x2_t v0 = vld1q_u64((uint64_t*)chunk->bfields); + const uint64x2_t v1 = vld1q_u64((uint64_t*)chunk->bfields + 2); + const uint64x2_t v2 = vld1q_u64((uint64_t*)chunk->bfields + 4); + const uint64x2_t v3 = vld1q_u64((uint64_t*)chunk->bfields + 6); + const uint64x2_t v = vorrq_u64(vorrq_u64(v0,v1),vorrq_u64(v2,v3)); + return (vmaxvq_u32(vreinterpretq_u32_u64(v)) == 0); #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; @@ -876,7 +883,6 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #endif } - static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { i--; From 0d302cd1749ac8025893923b1c1d77f9246199e0 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 31 Dec 2024 15:11:09 -0800 Subject: [PATCH 170/264] add comments --- include/mimalloc/types.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 8b72140a..c5029a14 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -100,9 +100,10 @@ terms of the MIT license. A copy of the license can be found in the file #endif -// ------------------------------------------------------ +// -------------------------------------------------------------- // Sizes of internal data-structures -// ------------------------------------------------------ +// (comments specify sizes on 64-bit, usually 32-bit is halved) +// -------------------------------------------------------------- // Sizes are for 64-bit #ifndef MI_ARENA_SLICE_SHIFT @@ -116,19 +117,19 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) #endif -#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) -#define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) +#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) // sub-bitmaps are "bchunks" of 512 bits +#define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) // arena's allocate in slices of 64 KiB #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) -#define MI_ARENA_MIN_OBJ_SLICES (1) -#define MI_ARENA_MAX_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) +#define MI_ARENA_MIN_OBJ_SLICES (1) +#define MI_ARENA_MAX_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE) #define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE) -#define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE -#define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) -#define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bitmap) +#define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE // 64 KiB +#define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bchunk bitmap) +#define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bchunk bitmap) // Maximum number of size classes. (spaced exponentially in 12.5% increments) #define MI_BIN_HUGE (73U) @@ -272,7 +273,7 @@ typedef uint8_t mi_heaptag_t; // // Notes: // - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). -// - If a page is not part of a heap it is called "abandoned" -- in +// - If a page is not part of a heap it is called "abandoned" (`heap==NULL`) -- in // that case the `xthreadid` is 0 or 1 (1 is for abandoned pages that // are in the abandoned page lists of an arena, these are called "mapped" abandoned pages). // - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` @@ -304,7 +305,7 @@ typedef struct mi_page_s { mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - size_t slice_committed; // committed size relative to the first arena slice of the page data + size_t slice_committed; // committed size relative to the first arena slice of the page data (or 0 if the page is fully committed already) mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -315,7 +316,7 @@ typedef struct mi_page_s { #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. 
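// A few illustrative sanity checks (not part of this patch) for the size
// relations documented above, assuming the 64-bit defaults (64 KiB slices,
// 512-bit bchunks) and C11's static_assert from <assert.h>:
static_assert((512*1024) / (64*1024) == 8,     "a 512 KiB medium page is 8 slices, i.e. one byte in a bchunk");
static_assert((4*1024*1024) / (64*1024) == 64, "a 4 MiB large page is 64 slices, i.e. one 64-bit word in a bchunk");
static_assert(512 * (64*1024) == 32*1024*1024, "one 512-bit bchunk covers 32 MiB of slices (the max arena object size)");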
#define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b) -#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks +#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks (such that we guarantee natural alignment) #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation #if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 @@ -328,7 +329,7 @@ typedef struct mi_page_s { // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with _mi_bin +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) From c507ee3d96a2146717d6ac5fe120d1dc2da545dd Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 11:42:28 -0800 Subject: [PATCH 171/264] make bitmap scan cross bfields for NX; disable the use of large object pages --- src/arena.c | 6 +- src/bitmap.c | 287 +++++++++++++++++++++++++++++---------------------- src/bitmap.h | 4 +- 3 files changed, 170 insertions(+), 127 deletions(-) diff --git a/src/arena.c b/src/arena.c index c8d4c9cd..11a4f82f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -773,9 +773,9 @@ mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t bloc else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } - else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); - } + //else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { + // page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + // } else { page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } diff --git a/src/bitmap.c b/src/bitmap.c index 03e21c89..5cecc606 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -130,6 +130,7 @@ static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t } // Clear a mask set of bits atomically, and return true of the mask bits transitioned from all 1's to 0's +// `all_clear` is set to `true` if the new bfield became zero. static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); @@ -155,6 +156,7 @@ static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_cle // Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0 // and false otherwise (leaving the bit field as is). +// `all_clear` is set to `true` if the new bfield became zero. 
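// A minimal C11 sketch of the try-clear-mask pattern documented above
// (illustration only; the real code below uses the mi_atomic_* wrappers rather
// than <stdatomic.h> directly, and the helper name here is hypothetical).
// Assumes <stdatomic.h>, <stdbool.h>, <stdint.h>.
static bool try_clear_mask_sketch(_Atomic(uint64_t)* b, uint64_t mask, bool* all_clear) {
  uint64_t old = atomic_load_explicit(b, memory_order_relaxed);
  do {
    if ((old & mask) != mask) return false;   // not all mask bits are set: fail, leave the field as-is
  } while (!atomic_compare_exchange_weak_explicit(b, &old, old & ~mask,
                                                  memory_order_acq_rel, memory_order_relaxed));
  if (all_clear != NULL) { *all_clear = ((old & ~mask) == 0); }
  return true;                                // the mask bits transitioned from all 1's to all 0's
}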
static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); @@ -170,9 +172,9 @@ static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bf } -// Tries to set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0) +// Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. -// `all_clear` is set to true if the new bfield is zero (and false otherwise) +// `all_clear` is set to true if the new bfield became zero (and false otherwise) static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx); + const bool was_clear = mi_bfield_atomic_set(&chunk->bfields[i], idx); + if (already_set != NULL) { *already_set = (was_clear ? 0 : 1); } + return was_clear; } +// Set `0 < n <= MI_BFIELD_BITS`, and return true of the mask bits transitioned from all 0's to 1's. +// `already_set` contains the count of bits that were already set (used when committing ranges to account +// statistics correctly). +// Can cross over two bfields. static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal(n > 0 && n <= MI_BFIELD_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - const mi_bfield_t mask = mi_bfield_mask(n, idx); - return mi_bfield_atomic_set_mask(&chunk->bfields[i], mask, already_set); -} - -static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t cidx, size_t* already_set) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - mi_assert_internal((cidx%MI_BFIELD_BITS)==0); - const size_t i = cidx / MI_BFIELD_BITS; - return mi_bfield_atomic_setX(&chunk->bfields[i], already_set); + if mi_likely(idx + n <= MI_BFIELD_BITS) { + // within one field + return mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(n,idx), already_set); + } + else { + // spanning two fields + const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field + mi_assert_internal(m < n); + mi_assert_internal(i < MI_BCHUNK_FIELDS - 1); + size_t already_set1; + const bool all_set1 = mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &already_set1); + mi_assert_internal(n - m > 0); + mi_assert_internal(n - m < MI_BFIELD_BITS); + size_t already_set2; + const bool all_set2 = mi_bfield_atomic_set_mask(&chunk->bfields[i+1], mi_bfield_mask(n - m, 0), &already_set2); + if (already_set != NULL) { *already_set = already_set1 + already_set2; } + return (all_set1 && all_set2); + } } // Set a sequence of `n` bits within a chunk. 
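// An illustrative helper (not part of the patch) showing how the NX routines,
// such as mi_bchunk_setNX above, split an `n`-bit range starting at bit `idx`
// over two adjacent 64-bit bfields. `bfield_mask_sketch` is a hypothetical
// stand-in for mi_bfield_mask; assumes <stdint.h> and <stddef.h>.
static inline uint64_t bfield_mask_sketch(size_t bit_count, size_t shift) {
  // assumes 0 < bit_count <= 64 and shift + bit_count <= 64
  const uint64_t ones = (bit_count >= 64 ? ~(uint64_t)0 : (((uint64_t)1 << bit_count) - 1));
  return (ones << shift);
}
// For `n` bits at `idx` with idx + n > 64, the two partial masks are:
//   first field : bfield_mask_sketch(64 - idx, idx)       // the top (64 - idx) bits of field i
//   second field: bfield_mask_sketch(n - (64 - idx), 0)   // the remaining low bits of field i+1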
@@ -298,6 +319,7 @@ mi_decl_noinline static bool mi_bchunk_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, // next field field++; idx = 0; + mi_assert_internal(m <= n); n -= m; } if (palready_set!=NULL) { *palready_set = total_already_set; } @@ -307,13 +329,10 @@ mi_decl_noinline static bool mi_bchunk_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); - if (n==1) { - bool was_clear = mi_bchunk_set(chunk, cidx); - if (already_set != NULL) { *already_set = !was_clear; } - return was_clear; - } - if (n==MI_BFIELD_BITS) return mi_bchunk_setX(chunk, cidx, already_set); - if (n bfields[i], idx, all_clear); } -static inline bool mi_bchunk_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - const mi_bfield_t mask = mi_bfield_mask(n, idx); - return mi_bfield_atomic_clear_mask(&chunk->bfields[i], mask, all_clear); -} - -static inline bool mi_bchunk_clearX(mi_bchunk_t* chunk, size_t cidx, bool* all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - mi_assert_internal((cidx%MI_BFIELD_BITS)==0); - const size_t i = cidx / MI_BFIELD_BITS; - return mi_bfield_atomic_clearX(&chunk->bfields[i], all_clear); -} - static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); if (n==1) return mi_bchunk_clear(chunk, cidx, maybe_all_clear); - if (n==MI_BFIELD_BITS) return mi_bchunk_clearX(chunk, cidx, maybe_all_clear); - if (n bfields[i], idx); } - if mi_likely(n<=MI_BFIELD_BITS) { return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } + if (n==1) { return mi_bfield_atomic_is_xset(set, &chunk->bfields[i], idx); } + if (idx + n <= MI_BFIELD_BITS) { return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } return mi_bchunk_is_xsetN_(set, chunk, i, idx, n); } // ------- mi_bchunk_try_clear --------------------------------------- +// Clear `0 < n <= MI_BITFIELD_BITS`. Can cross over a bfield boundary. static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { mi_assert_internal(cidx < MI_BCHUNK_BITS); mi_assert_internal(n <= MI_BFIELD_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - mi_assert_internal(idx + n <= MI_BFIELD_BITS); - const size_t mask = mi_bfield_mask(n, idx); - return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mask, pmaybe_all_clear); + if mi_likely(idx + n <= MI_BFIELD_BITS) { + // within one field + return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mi_bfield_mask(n, idx), pmaybe_all_clear); + } + else { + // spanning two fields (todo: use double-word atomic ops?) 
+ const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field + mi_assert_internal(m < n); + mi_assert_internal(i < MI_BCHUNK_FIELDS - 1); + bool field1_is_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &field1_is_clear)) return false; + // try the second field as well + mi_assert_internal(n - m > 0); + mi_assert_internal(n - m < MI_BFIELD_BITS); + bool field2_is_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[i+1], mi_bfield_mask(n - m, 0), &field2_is_clear)) { + // we failed to clear the second field, restore the first one + mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), NULL); + return false; + } + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = field1_is_clear && field2_is_clear; } + return true; + } } +// Clear a full aligned bfield. static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { mi_assert_internal(cidx < MI_BCHUNK_BITS); mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); @@ -405,60 +432,51 @@ static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* p return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); } -// Try to atomically set/clear a sequence of `n` bits within a chunk. -// Returns true if all bits transitioned from 0 to 1 (or 1 to 0), +// Try to atomically clear a sequence of `n` bits within a chunk. +// Returns true if all bits transitioned from 1 to 0, // and false otherwise leaving all bit fields as is. -// Note: this is a hard one as we need to unwind partial atomic operations -// if we fail halfway.. +// Note: this is the complex one as we need to unwind partial atomic operations if we fail halfway.. +// `maybe_all_clear` is set to `true` if all the bfields involved become zero. mi_decl_noinline static bool mi_bchunk_try_clearN_(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = true; } if (n==0) return true; - size_t start_idx = cidx % MI_BFIELD_BITS; - size_t start_field = cidx / MI_BFIELD_BITS; - size_t end_field = MI_BCHUNK_FIELDS; - mi_bfield_t mask_mid = 0; - mi_bfield_t mask_end = 0; - bool field_is_clear; - bool maybe_all_clear = true; - if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = false; } - + // first field + const size_t start_idx = cidx % MI_BFIELD_BITS; + const size_t start_field = cidx / MI_BFIELD_BITS; size_t field = start_field; - size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field + size_t m = MI_BFIELD_BITS - start_idx; // m are the bits to clear in this field if (m > n) { m = n; } mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); mi_assert_internal(start_field < MI_BCHUNK_FIELDS); const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); - if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &field_is_clear)) return false; - maybe_all_clear = maybe_all_clear && field_is_clear; + bool maybe_all_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &maybe_all_clear)) return false; // done? 
+ mi_assert_internal(m <= n); n -= m; - if (n==0) { - if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; } - return true; - } - - // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields - - // mid fields + + // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields + // mid fields? while (n >= MI_BFIELD_BITS) { field++; mi_assert_internal(field < MI_BCHUNK_FIELDS); - mask_mid = mi_bfield_all_set(); - if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_mid, &field_is_clear)) goto restore; + bool field_is_clear; + if (!mi_bfield_atomic_try_clearX(&chunk->bfields[field], &field_is_clear)) goto restore; maybe_all_clear = maybe_all_clear && field_is_clear; n -= MI_BFIELD_BITS; } - // last field + // last field? if (n > 0) { mi_assert_internal(n < MI_BFIELD_BITS); field++; mi_assert_internal(field < MI_BCHUNK_FIELDS); - end_field = field; - mask_end = mi_bfield_mask(n, 0); + const mi_bfield_t mask_end = mi_bfield_mask(n, 0); + bool field_is_clear; if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_end, &field_is_clear)) goto restore; maybe_all_clear = maybe_all_clear && field_is_clear; } @@ -467,12 +485,16 @@ mi_decl_noinline static bool mi_bchunk_try_clearN_(mi_bchunk_t* chunk, size_t ci return true; restore: - // field is on the field that failed to set atomically; we need to restore all previous fields + // `field` is the index of the field that failed to set atomically; we need to restore all previous fields mi_assert_internal(field > start_field); while( field > start_field) { field--; - const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid)); - mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, NULL); + if (field == start_field) { + mi_bfield_atomic_set_mask(&chunk->bfields[field], mask_start, NULL); + } + else { + mi_bfield_atomic_setX(&chunk->bfields[field], NULL); // mid-field: set all bits again + } } return false; } @@ -480,8 +502,8 @@ restore: static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { mi_assert_internal(n>0); - if (n==MI_BFIELD_BITS) return mi_bchunk_try_clearX(chunk, cidx, maybe_all_clear); - if (n MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); + // for all fields in the chunk for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t idx; + // is there a range inside the field? 
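// An illustrative single-word version (not the patch code) of the search loop
// that follows: find `n` consecutive 1-bits inside one 64-bit field. Assumes
// 0 < n < 64, GCC/Clang's __builtin_ctzll, and <stdint.h>/<stdbool.h>/<stddef.h>;
// the real code additionally handles the cross-over into the next bfield.
static inline bool find_ones_run_sketch(uint64_t b, size_t n, size_t* pidx) {
  const uint64_t mask = (((uint64_t)1 << n) - 1);
  while (b != 0) {
    const size_t idx = (size_t)__builtin_ctzll(b);                    // least set bit
    if (idx + n > 64) return false;                                   // no room left in this word
    if (((b >> idx) & mask) == mask) { *pidx = idx; return true; }    // found n consecutive 1's
    const size_t ones = (size_t)__builtin_ctzll(~(b >> idx));         // length of the 1-run at idx (< n)
    b &= ~((((uint64_t)1 << ones) - 1) << idx);                       // clear that run and keep searching
  }
  return false;
}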
while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - if (idx + n > MI_BFIELD_BITS) break; + if (idx + n > MI_BFIELD_BITS) break; // too short, maybe cross over, or continue with the next field const size_t bmask = mask<>idx == mask); - if ((b&bmask) == bmask) { // found a match + if ((b&bmask) == bmask) { // found a match with all bits set, try clearing atomically if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); @@ -753,7 +776,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, return true; } else { - // if failed to atomically commit, reload b and try again from this position + // if we failed to atomically commit, reload b and try again from the start b = mi_atomic_load_acquire(&chunk->bfields[i]); } } @@ -764,6 +787,25 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, b = b & ~mi_bfield_mask(ones, idx); // clear the ones } } + + // check if we can cross into the next bfield + if (i < MI_BCHUNK_FIELDS-1) { + const size_t post = mi_bfield_clz(~b); + if (post > 0) { + const size_t pre = mi_bfield_ctz(mi_atomic_load_relaxed(&chunk->bfields[i+1])); + if (post + pre <= n) { + // it fits -- try to claim it atomically + const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post); + if (mi_bchunk_try_clearNX(chunk, cidx, n, NULL)) { + // we cleared all atomically + *pidx = cidx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); + return true; + } + } + } + } } return false; } @@ -775,46 +817,47 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - const size_t skip_count = n/MI_BFIELD_BITS; + // we first scan ahead to see if there is a range of `n` set bits, and only then try to clear atomically + mi_assert_internal(n>0); + const size_t skip_count = (n-1)/MI_BFIELD_BITS; size_t cidx; - for (size_t i = 0; i <= MI_BCHUNK_FIELDS - skip_count; i++) + for (size_t i = 0; i < MI_BCHUNK_FIELDS - skip_count; i++) { size_t m = n; // bits to go // first field mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t ones = mi_bfield_clz(~b); - cidx = i*MI_BFIELD_BITS + (MI_BFIELD_BITS - ones); // start index + cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - ones); // start index if (ones >= m) { // we found enough bits! m = 0; } else { m -= ones; - mi_assert_internal(m>0); - } - - // keep scanning further fields? - size_t j = 1; // field count from i - while (i+j < MI_BCHUNK_FIELDS) { - mi_assert_internal(m > 0); - b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); - ones = mi_bfield_ctz(~b); - if (ones >= m) { - // we found enough bits - m = 0; - break; - } - else if (ones == MI_BFIELD_BITS) { - // not enough yet, proceed to the next field - j++; - m -= MI_BFIELD_BITS; - } - else { - // the range was not enough, start from scratch - i = i + j - 1; // no need to re-scan previous fields, except the last one (with clz this time) - mi_assert_internal(m>0); - break; + + // keep scanning further fields? 
+ size_t j = 1; // field count from i + while (i+j < MI_BCHUNK_FIELDS) { + mi_assert_internal(m > 0); + b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); + ones = mi_bfield_ctz(~b); + if (ones >= m) { + // we found enough bits + m = 0; + break; + } + else if (ones == MI_BFIELD_BITS) { + // not enough yet, proceed to the next field + j++; + m -= MI_BFIELD_BITS; + } + else { + // the range was not enough, start from scratch + i = i + j - 1; // no need to re-scan previous fields, except the last one (with clz this time) + mi_assert_internal(m>0); + break; + } } } @@ -838,9 +881,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, //static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { // if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages // if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages -// if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages -// if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk -// if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); +// // if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages +// if (n==0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk +// if (n<=MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); // return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); //} @@ -909,7 +952,7 @@ static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) { static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL); mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); } @@ -922,7 +965,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) // .. but a concurrent set may have happened in between our all-clear test and the clearing of the // bit in the mask. We check again to catch this situation. 
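// The summary-bit protocol sketched standalone (illustrative only; the real
// chunk is a 512-bit bchunk, simplified here to one atomic word, and the name is
// hypothetical; assumes <stdatomic.h>, <stdint.h>, <stdbool.h>, <stddef.h>,
// chunk_idx < 64): clear the chunkmap bit first, then re-check the chunk and
// restore the bit if a concurrent set made the chunk non-empty again.
static bool chunkmap_try_clear_sketch(_Atomic(uint64_t)* chunkmap, _Atomic(uint64_t)* chunk, size_t chunk_idx) {
  if (atomic_load_explicit(chunk, memory_order_relaxed) != 0) return false;                  // chunk not empty
  atomic_fetch_and_explicit(chunkmap, ~((uint64_t)1 << chunk_idx), memory_order_acq_rel);    // clear summary bit
  if (atomic_load_explicit(chunk, memory_order_relaxed) != 0) {
    atomic_fetch_or_explicit(chunkmap, ((uint64_t)1 << chunk_idx), memory_order_acq_rel);    // raced: restore
    return false;
  }
  return true;
}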
if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) { - mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL); return false; } mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); @@ -1018,7 +1061,7 @@ bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards return wasclear; } @@ -1235,9 +1278,9 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); } -bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); -} +//bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +// return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); +//} bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { mi_assert_internal(n<=MI_BFIELD_BITS); @@ -1279,7 +1322,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk else { // failed to claim it, set abandoned mapping again (unless the page was freed) if (keep_set) { - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx, NULL); mi_assert_internal(wasclear); MI_UNUSED(wasclear); } } diff --git a/src/bitmap.h b/src/bitmap.h index 16ecea07..09967fb9 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -191,7 +191,7 @@ static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { // Specialized versions for common bit sequence sizes mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +// mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS @@ -200,7 +200,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_ mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages + // if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk if (n < 
MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); From 5e26ba6fe62e5624dd65564501ef8d2fd915e56d Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 12:14:12 -0800 Subject: [PATCH 172/264] fix debug output --- src/arena.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/arena.c b/src/arena.c index 11a4f82f..4c363a57 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1319,10 +1319,10 @@ static int mi_page_commit_usage(mi_page_t* page) { return (int)(used_size * 100 / committed_size); } -static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, mi_arena_t* arena, size_t slice_index) { +static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, mi_arena_t* arena, size_t slice_index, long* pbit_of_page, mi_ansi_color_t* pcolor_of_page ) { size_t bit_set_count = 0; - long bit_of_page = 0; - mi_ansi_color_t color = MI_GRAY; + long bit_of_page = *pbit_of_page; + mi_ansi_color_t color = *pcolor_of_page; mi_ansi_color_t prev_color = MI_GRAY; for (int bit = 0; bit < MI_BFIELD_BITS; bit++, bit_of_page--) { bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); @@ -1331,9 +1331,9 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, if (is_set) { mi_assert_internal(bit_of_page <= 0); bit_set_count++; - mi_page_t* page = (mi_page_t*)start; c = 'p'; color = MI_GRAY; + mi_page_t* page = (mi_page_t*)start; if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } else if (mi_page_is_abandoned(page)) { c = (mi_page_is_singleton(page) ? 's' : 'f'); } int commit_usage = mi_page_commit_usage(page); @@ -1362,7 +1362,9 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, } buf[*k] = c; *k += 1; } - mi_debug_color(buf, k, MI_GRAY); + mi_debug_color(buf, k, MI_GRAY); + *pbit_of_page = bit_of_page; + *pcolor_of_page = color; return bit_set_count; } @@ -1381,6 +1383,8 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } + long bit_of_page = 0; + mi_ansi_color_t color_of_page = MI_GRAY; for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % MI_FIELDS_PER_LINE) == 0) { _mi_output_message(" %s\n\x1B[37m", buf); @@ -1390,7 +1394,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; - size_t xcount = (arena!=NULL ? mi_debug_show_page_bfield(bfield, buf, &k, arena, bit_count) + size_t xcount = (arena!=NULL ? 
mi_debug_show_page_bfield(bfield, buf, &k, arena, bit_count, &bit_of_page, &color_of_page) : mi_debug_show_bfield(bfield, buf, &k)); if (invert) xcount = MI_BFIELD_BITS - xcount; bit_set_count += xcount; From 10b40f90fc12b1e6895555410561c07b0cba0344 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 14:59:42 -0800 Subject: [PATCH 173/264] fix scan of NX --- src/bitmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 5cecc606..067faff0 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -281,6 +281,7 @@ static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, si const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field mi_assert_internal(m < n); mi_assert_internal(i < MI_BCHUNK_FIELDS - 1); + mi_assert_internal(idx + m <= MI_BFIELD_BITS); size_t already_set1; const bool all_set1 = mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &already_set1); mi_assert_internal(n - m > 0); @@ -792,7 +793,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, if (i < MI_BCHUNK_FIELDS-1) { const size_t post = mi_bfield_clz(~b); if (post > 0) { - const size_t pre = mi_bfield_ctz(mi_atomic_load_relaxed(&chunk->bfields[i+1])); + const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1])); if (post + pre <= n) { // it fits -- try to claim it atomically const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post); From 34e402e128402c4d534f0513b76f54ecfaa573dd Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 15:00:17 -0800 Subject: [PATCH 174/264] fix NX test in try_find_and_clearN --- src/bitmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bitmap.h b/src/bitmap.h index 09967fb9..8ab06216 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -201,8 +201,8 @@ mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages // if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages - if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); + if (n==0 || n>MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n <= MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); } From ab78d57a843476edd6e89139585a98011e107911 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 15:19:08 -0800 Subject: [PATCH 175/264] search size bins from small to large --- src/bitmap.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index a03aef69..b9daf7c6 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1552,14 +1552,16 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); const size_t cmap_cycle = cmap_acc+1; const mi_bbin_t bbin = mi_bbin_of(n); - // visit bins from largest size bin up to the NONE bin - for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL - // const mi_bbin_t bin = bbin; + // visit bins from smallest to largest (to reduce fragmentation on the larger blocks) + 
for(int bin = MI_BBIN_SMALL; bin <= bbin; bin++) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL + // (int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // visit bins from largest size bin up to the NONE bin { mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { // don't search into non-accessed memory until we tried other size bins as well - if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { + if (bin < bbin && cmap_idx > cmap_acc) + // (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) // large to small + { break; } @@ -1573,8 +1575,10 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); // only in the current size class! const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_relaxed(&bbitmap->chunk_bins[chunk_idx]); - if // (bin >= chunk_bin) { - ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { + if ((mi_bbin_t)bin == chunk_bin || (bin == bbin && chunk_bin == MI_BBIN_NONE)) // only allow NONE at the final run + // ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { largest to smallest + + { mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; size_t cidx; if ((*on_find)(chunk, n, &cidx)) { From 2a75500ac2a43fc52c394181579347c7cb336965 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 08:38:36 -0800 Subject: [PATCH 176/264] disable large pages by default --- include/mimalloc/types.h | 15 +++++++++++++-- src/arena.c | 6 +++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index c5029a14..9fefdf60 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -99,6 +99,10 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ENCODE_FREELIST 1 #endif +// Enable large pages for objects between 128KiB and 512KiB. Disabled by default. +#ifndef MI_ENABLE_LARGE_PAGES +#define MI_ENABLE_LARGE_PAGES 0 +#endif // -------------------------------------------------------------- // Sizes of internal data-structures @@ -131,6 +135,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bchunk bitmap) #define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bchunk bitmap) + // Maximum number of size classes. (spaced exponentially in 12.5% increments) #define MI_BIN_HUGE (73U) #define MI_BIN_FULL (MI_BIN_HUGE+1) @@ -328,8 +333,14 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` + +#if MI_ENABLE_LARGE_PAGES +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#else +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB +#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE +#endif #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index 60046cdc..cf1836f7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -773,9 +773,9 @@ mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t bloc else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } - //else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - // page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); - // } + else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { + page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + } else { page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } From bbd7a492f0f5cab84c08a0bab38151e28908a63e Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 08:46:30 -0800 Subject: [PATCH 177/264] fix signedness warning --- src/bitmap.c | 2 +- src/bitmap.h | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index b9daf7c6..ce92fe3f 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1553,7 +1553,7 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const size_t cmap_cycle = cmap_acc+1; const mi_bbin_t bbin = mi_bbin_of(n); // visit bins from smallest to largest (to reduce fragmentation on the larger blocks) - for(int bin = MI_BBIN_SMALL; bin <= bbin; bin++) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL + for(mi_bbin_t bin = MI_BBIN_SMALL; bin <= bbin; bin = mi_bbin_inc(bin)) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL // (int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // visit bins from largest size bin up to the NONE bin { mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) diff --git a/src/bitmap.h b/src/bitmap.h index 9969aec0..9afdffce 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -219,12 +219,21 @@ typedef enum mi_bbin_e { MI_BBIN_SMALL, // slice_count == 1 MI_BBIN_OTHER, // slice_count: any other from the other bins, and 1 <= slice_count <= MI_BCHUNK_BITS MI_BBIN_MEDIUM, // slice_count == 8 - MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS -- not used for now! + MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS -- only used if MI_ENABLE_LARGE_PAGES is 1 MI_BBIN_COUNT } mi_bbin_t; -static inline mi_bbin_t mi_bbin_of(size_t n) { - return (n==1 ? MI_BBIN_SMALL : (n==8 ? MI_BBIN_MEDIUM : MI_BBIN_OTHER)); // (n==64 ? 
MI_BBIN_LARGE : MI_BBIN_OTHER))); +static inline mi_bbin_t mi_bbin_inc(mi_bbin_t bbin) { + return (mi_bbin_t)((int)bbin + 1); +} + +static inline mi_bbin_t mi_bbin_of(size_t slice_count) { + if (slice_count==1) return MI_BBIN_SMALL; + if (slice_count==8) return MI_BBIN_MEDIUM; + #if MI_ENABLE_LARGE_PAGES + if (slice_count==MI_BFIELD_BITS) return MI_BBIN_LARGE; + #endif + return MI_BBIN_OTHER; } // An atomic "binned" bitmap for the free slices where we keep chunks reserved for particalar size classes From 281a513642df30ef0ee54b047fe12e64499e7a44 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 08:48:06 -0800 Subject: [PATCH 178/264] fix initialization warning on gcc --- src/bitmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 067faff0..6b371aed 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1181,7 +1181,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n while(_bcount##SUF > 0) { \ _bcount##SUF--;\ if (_b##SUF==0) { _b##SUF = bfield & ~_cycle_mask##SUF; } /* process [0,start> + [cycle, MI_BFIELD_BITS> next */ \ - size_t name_idx; \ + /* size_t name_idx; */ \ bool _found##SUF = mi_bfield_find_least_bit(_b##SUF,&name_idx); \ mi_assert_internal(_found##SUF); MI_UNUSED(_found##SUF); \ { \ @@ -1221,11 +1221,13 @@ static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, si mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); const size_t cmap_cycle = cmap_acc+1; + size_t cmap_idx = 0; mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { // and for each chunkmap entry we iterate over its bits to find the chunks mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[cmap_idx]); size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? 
MI_BFIELD_BITS : cmap_acc_bits); + size_t eidx = 0; mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) { mi_assert_internal(eidx <= MI_BFIELD_BITS); From b6adbbca0cb02f7796112903c2b154e678ba2cce Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 13:15:46 -0800 Subject: [PATCH 179/264] combine flags and xthread_id --- include/mimalloc/internal.h | 25 ++++++++++++++++--------- include/mimalloc/prim.h | 23 +++++++++++++++-------- include/mimalloc/types.h | 11 ++++++----- src/alloc.c | 2 +- src/free.c | 31 ++++++++++++++++--------------- src/init.c | 3 +-- src/page-map.c | 8 +++----- 7 files changed, 58 insertions(+), 45 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e83186e8..e175f331 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -622,11 +622,16 @@ static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { } -// Thread id of thread that owns this page -static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { +// Thread id of thread that owns this page (with flags in the bottom 2 bits) +static inline mi_threadid_t mi_page_xthread_id(const mi_page_t* page) { return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id); } +// Plain thread id of the thread that owns this page +static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { + return (mi_page_xthread_id(page) & ~MI_PAGE_FLAG_MASK); +} + // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); @@ -695,19 +700,21 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id) <= 1); + return (mi_page_xthread_id(page) <= MI_PAGE_IS_ABANDONED_MAPPED); } static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return (mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id) == 1); + return (mi_page_xthread_id(page) == MI_PAGE_IS_ABANDONED_MAPPED); } static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { - mi_atomic_or_relaxed(&page->xthread_id, (uintptr_t)1); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_atomic_or_relaxed(&page->xthread_id, MI_PAGE_IS_ABANDONED_MAPPED); } static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { - mi_atomic_and_relaxed(&page->xthread_id, ~(uintptr_t)1); + mi_assert_internal(mi_page_is_abandoned_mapped(page)); + mi_atomic_and_relaxed(&page->xthread_id, ~MI_PAGE_IS_ABANDONED_MAPPED); } @@ -766,15 +773,15 @@ static inline bool _mi_page_unown(mi_page_t* page) { // Page flags //----------------------------------------------------------- static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { - return mi_atomic_load_relaxed(&((mi_page_t*)page)->xflags); + return (mi_page_xthread_id(page) & MI_PAGE_FLAG_MASK); } static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { if (set) { - mi_atomic_or_relaxed(&page->xflags, newflag); + mi_atomic_or_relaxed(&page->xthread_id, newflag); } else { - mi_atomic_and_relaxed(&page->xflags, ~newflag); + mi_atomic_and_relaxed(&page->xthread_id, ~newflag); } } diff --git a/include/mimalloc/prim.h 
b/include/mimalloc/prim.h index 687729c5..8043fd7f 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -270,35 +270,42 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce // defined in `init.c`; do not use these directly -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -extern bool _mi_process_is_initialized; // has mi_process_init been called? +extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept; + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + const mi_threadid_t tid = __mi_prim_thread_id(); + mi_assert_internal(tid > 1); + mi_assert_internal((tid & MI_PAGE_FLAG_MASK) == 0); // bottom 2 bits are clear? + return tid; +} // Get a unique id for the current thread. #if defined(MI_PRIM_THREAD_ID) -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488) } #elif defined(_WIN32) -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { // Windows: works on Intel and ARM in both 32- and 64-bit return (uintptr_t)NtCurrentTeb(); } #elif MI_USE_BUILTIN_THREAD_POINTER -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { // Works on most Unix based platforms with recent compilers return (uintptr_t)__builtin_thread_pointer(); } #elif MI_HAS_TLS_SLOT -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { #if defined(__BIONIC__) // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 @@ -314,7 +321,7 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #else // otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { return (uintptr_t)&_mi_heap_default; } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 9fefdf60..1cab7742 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -241,14 +241,16 @@ typedef struct mi_block_s { } mi_block_t; -// The `in_full` and `has_aligned` page flags are put in the same field -// to efficiently test if both are false (`full_aligned == 0`) in the `mi_free` routine. 
+// The `in_full` and `has_aligned` page flags are put in the bottom bits of the thread_id (for fast test in `mi_free`) // `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing) // `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing) -#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) -#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) +#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) +#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) +#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) +#define MI_PAGE_FLAG_MASK MI_ZU(0x07) typedef size_t mi_page_flags_t; + // Thread free list. // Points to a list of blocks that are freed by other threads. // The low-bit is set if the page is owned by the current thread. (`mi_page_is_owned`). @@ -296,7 +298,6 @@ typedef struct mi_page_s { mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - _Atomic(mi_page_flags_t) xflags; // `in_full_queue` and `has_aligned` flags size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks diff --git a/src/alloc.c b/src/alloc.c index 6b037987..9cd44338 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -272,7 +272,7 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) // if p == NULL then behave as malloc. // else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)). // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.) - const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0) + const size_t size = (p==NULL ? 0 : _mi_usable_size(p,"mi_realloc")); if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) { // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0) mi_assert_internal(p!=NULL); // todo: do not track as the usable size is still the same in the free; adjust potential padding? diff --git a/src/free.c b/src/free.c index 7467adc1..f63a55cb 100644 --- a/src/free.c +++ b/src/free.c @@ -122,6 +122,7 @@ static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_ // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept { + if (p==NULL) return; // a NULL pointer is seen as abandoned (tid==0) with a full flag set mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_block_check_unguard(page, block, p); mi_free_block_mt(page, block); @@ -160,24 +161,24 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) void mi_free(void* p) mi_attr_noexcept { mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); - if mi_unlikely(page==NULL) return; - const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); - const mi_page_flags_t flags = mi_page_flags(page); - if mi_likely(is_local) { // thread-local free? 
- if mi_likely(flags == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) - // thread-local, aligned, and not a full page - mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); - } - else { - // page is full or contains (inner) aligned blocks; use generic path - mi_free_generic_local(page, p); - } + #if MI_PAGE_MAP_FLAT // if not flat, NULL will point to `_mi_page_empty` and get to `mi_free_generic_mt` + if mi_unlikely(page==NULL) return; + #endif + + const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page)); + if mi_likely(xtid == 0) { // thread-local free? `tid==mi_page_thread_id(page) && mi_page_flags(page)==0` + // thread-local, aligned, and not a full page + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); + } + else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid= = mi_page_thread_id(page) && mi_page_flags(page)!=0` + // page is local, but is full or contains (inner) aligned blocks; use generic path + mi_free_generic_local(page, p); } else { - // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) - if mi_likely(flags == 0) { + // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) + if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0` // blocks are aligned (and not a full page) mi_block_t* const block = (mi_block_t*)p; mi_free_block_mt(page,block); diff --git a/src/init.c b/src/init.c index 439a914c..c697a1e9 100644 --- a/src/init.c +++ b/src/init.c @@ -16,7 +16,7 @@ terms of the MIT license. 
A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - MI_ATOMIC_VAR_INIT(0), // xthread_id + MI_ATOMIC_VAR_INIT(MI_PAGE_IN_FULL_QUEUE), // xthread_id (must set flag to catch NULL on a free) NULL, // free 0, // used 0, // capacity @@ -25,7 +25,6 @@ const mi_page_t _mi_page_empty = { 0, // retire_expire NULL, // local_free MI_ATOMIC_VAR_INIT(0), // xthread_free - MI_ATOMIC_VAR_INIT(0), // xflags 0, // block_size NULL, // page_start 0, // heap tag diff --git a/src/page-map.c b/src/page-map.c index a917175a..1cf0b07b 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -210,11 +210,9 @@ bool _mi_page_map_init(void) { if (!mi_page_map_memid.initially_committed) { _mi_os_commit(_mi_page_map[0], os_page_size, NULL); // only first OS page } - if (!mi_page_map_memid.initially_zero) { - _mi_page_map[0][0] = NULL; - } - - mi_assert_internal(_mi_ptr_page(NULL)==NULL); + _mi_page_map[0][0] = (mi_page_t*)&_mi_page_empty; // caught in `mi_free` + + mi_assert_internal(_mi_ptr_page(NULL)==&_mi_page_empty); return true; } From f6c2550eac92710b23c7b5af3bb2e20bccd2cc96 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 3 Jan 2025 13:50:31 -0800 Subject: [PATCH 180/264] fix enable large pages --- include/mimalloc/types.h | 8 +------- src/arena.c | 2 ++ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 1cab7742..089ed199 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -334,14 +334,8 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB - -#if MI_ENABLE_LARGE_PAGES -#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB #define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` -#else -#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE -#endif #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index cf1836f7..0c571c96 100644 --- a/src/arena.c +++ b/src/arena.c @@ -773,9 +773,11 @@ mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t bloc else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } + #if MI_ENABLE_LARGE_PAGES else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } + #endif else { page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } From 6099f76c8c9f67c815ba147506451008616d9282 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 14:26:32 -0800 Subject: [PATCH 181/264] nicer logic in free --- src/free.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/free.c b/src/free.c index f63a55cb..5efe0280 100644 --- a/src/free.c +++ b/src/free.c @@ -176,18 +176,16 @@ void mi_free(void* p) mi_attr_noexcept // page is local, but is full or contains (inner) aligned blocks; use generic path 
mi_free_generic_local(page, p); } - else { - // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) - if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0` - // blocks are aligned (and not a full page) - mi_block_t* const block = (mi_block_t*)p; - mi_free_block_mt(page,block); - } - else { - // page is full or contains (inner) aligned blocks; use generic multi-thread path - mi_free_generic_mt(page, p); - } + // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) + else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0` + // blocks are aligned (and not a full page) + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_mt(page,block); } + else { + // page is full or contains (inner) aligned blocks; use generic multi-thread path + mi_free_generic_mt(page, p); + } } From c95d9865a876e598d54b53fe293f0a348926517e Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 14:27:18 -0800 Subject: [PATCH 182/264] merge from dev3-bin --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index ce045173..3363c68a 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From e14c8fc795cd1a8ef21605225b3d556e74b434f7 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 18:08:34 -0800 Subject: [PATCH 183/264] bump version to 3.0.0 --- azure-pipelines.yml | 23 ----------------------- cmake/mimalloc-config-version.cmake | 6 +++--- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e1a199d3..5393035e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -331,26 +331,3 @@ jobs: workingDirectory: $(BuildType) displayName: CTest -- job: - displayName: macOS 12 (Monterey) - pool: - vmImage: - macOS-12 - strategy: - matrix: - Debug: - BuildType: debug - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON - Release: - BuildType: release - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release - steps: - - task: CMake@1 - inputs: - workingDirectory: $(BuildType) - cmakeArgs: .. 
$(cmakeExtraArgs) - - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) - displayName: Make - - script: ctest --verbose --timeout 180 - workingDirectory: $(BuildType) - displayName: CTest diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index f92d52e6..04f27e6d 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ -set(mi_version_major 1) -set(mi_version_minor 8) -set(mi_version_patch 8) +set(mi_version_major 3) +set(mi_version_minor 0) +set(mi_version_patch 0) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) From 46ae913f22c46fc60006d0fcb0829d078a2dea76 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 18:43:38 -0800 Subject: [PATCH 184/264] bump version to 3.0.1 for further development --- cmake/mimalloc-config-version.cmake | 2 +- include/mimalloc.h | 2 +- include/mimalloc/types.h | 13 +++++++------ src/free.c | 10 +++++----- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index 04f27e6d..60cc2d3d 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 3) set(mi_version_minor 0) -set(mi_version_patch 0) +set(mi_version_patch 1) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/include/mimalloc.h b/include/mimalloc.h index 7383ce8a..fb7efcde 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 300 // major + 2 digits minor +#define MI_MALLOC_VERSION 301 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f13149b1..ec4144d1 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -241,9 +241,10 @@ typedef struct mi_block_s { } mi_block_t; -// The `in_full` and `has_aligned` page flags are put in the bottom bits of the thread_id (for fast test in `mi_free`) +// The page flags are put in the bottom 3 bits of the thread_id (for a fast test in `mi_free`) // `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing) // `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing) +// `is_abandoned_mapped` is true if the page is abandoned (thread_id==0) and it is in an arena so can be quickly found for reuse ("mapped") #define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) #define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) #define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) @@ -253,10 +254,9 @@ typedef size_t mi_page_flags_t; // Thread free list. // Points to a list of blocks that are freed by other threads. -// The low-bit is set if the page is owned by the current thread. (`mi_page_is_owned`). +// The least-bit is set if the page is owned by the current thread. (`mi_page_is_owned`). // Ownership is required before we can read any non-atomic fields in the page. -// This way we can push a block on the thread free list and try to claim ownership -// atomically in `free.c:mi_free_block_mt`. +// This way we can push a block on the thread free list and try to claim ownership atomically in `free.c:mi_free_block_mt`. 
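// (Editor's sketch, not part of this patch: the encoding described above packs the
//  ownership bit into the least-significant bit of the block pointer, roughly
//      mi_thread_free_t tf  = (mi_thread_free_t)block | (owned ? 1 : 0);
//      mi_block_t* head     = (mi_block_t*)(tf & ~(mi_thread_free_t)1);   // cf. `mi_tf_block`
//  so a single atomic update of `xthread_free` can push a block and claim ownership together.)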
typedef uintptr_t mi_thread_free_t; // A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython) @@ -281,13 +281,14 @@ typedef uint8_t mi_heaptag_t; // Notes: // - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). // - If a page is not part of a heap it is called "abandoned" (`heap==NULL`) -- in -// that case the `xthreadid` is 0 or 1 (1 is for abandoned pages that +// that case the `xthreadid` is 0 or 4 (4 is for abandoned pages that // are in the abandoned page lists of an arena, these are called "mapped" abandoned pages). +// - page flags are in the bottom 3 bits of `xthread_id` for the fast path in `mi_free`. // - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down typedef struct mi_page_s { - _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= heap->thread_id, or 0 or 1 if abandoned) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= heap->thread_id (or 0 if abandoned) | page_flags) mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) uint16_t used; // number of blocks in use (including blocks in `thread_free`) diff --git a/src/free.c b/src/free.c index 5efe0280..ed1b830e 100644 --- a/src/free.c +++ b/src/free.c @@ -167,18 +167,18 @@ void mi_free(void* p) mi_attr_noexcept #endif const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page)); - if mi_likely(xtid == 0) { // thread-local free? `tid==mi_page_thread_id(page) && mi_page_flags(page)==0` + if mi_likely(xtid == 0) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) == 0` // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); } - else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid= = mi_page_thread_id(page) && mi_page_flags(page)!=0` + else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) != 0` // page is local, but is full or contains (inner) aligned blocks; use generic path mi_free_generic_local(page, p); } - // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) - else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0` - // blocks are aligned (and not a full page) + // free-ing in a page owned by a heap in another thread, or an abandoned page (not belonging to a heap) + else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid != mi_page_thread_id(page) && mi_page_flags(page) == 0` + // blocks are aligned (and not a full page); push on the thread_free list mi_block_t* const block = (mi_block_t*)p; mi_free_block_mt(page,block); } From fab6bee76406340d8582e15530380a6cdde954a3 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 4 Jan 2025 22:39:06 -0800 Subject: [PATCH 185/264] nicer arena debug output --- src/arena.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/arena.c b/src/arena.c index 2dae0fb5..64b1327f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1317,12 +1317,7 @@ typedef enum mi_ansi_color_e { } mi_ansi_color_t; static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) { - buf[*k] = '\x1b'; - buf[*k+1] = '['; - buf[*k+2] = (char)(((int)color / 10) + '0'); - buf[*k+3] = (char)(((int)color % 10) + '0'); - buf[*k+4] = 'm'; - *k += 5; + *k += 
_mi_snprintf(buf + *k, 32, "\x1B[%dm", (int)color); } static int mi_page_commit_usage(mi_page_t* page) { @@ -1347,13 +1342,14 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, c = 'p'; color = MI_GRAY; mi_page_t* page = (mi_page_t*)start; - if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } - else if (mi_page_is_abandoned(page)) { c = (mi_page_is_singleton(page) ? 's' : 'f'); } + if (mi_page_is_singleton(page)) { c = 's'; } + else if (mi_page_is_full(page)) { c = 'f'; } + if (!mi_page_is_abandoned(page)) { c = _mi_toupper(c); } int commit_usage = mi_page_commit_usage(page); if (commit_usage < 25) { color = MI_MAROON; } else if (commit_usage < 50) { color = MI_ORANGE; } else if (commit_usage < 75) { color = MI_TEAL; } - else color = MI_DARKGREEN; + else color = MI_DARKGREEN; bit_of_page = (long)page->memid.mem.arena.slice_count; } else { @@ -1476,7 +1472,7 @@ void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap_binned("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena); + page_total += mi_debug_show_bitmap_binned("pages (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved) (chunk bin: S:small, M:medium, L:large, X:other)", arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena); } } // if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); From c518312fb6d67505b64e93759839c177cb9e6c0d Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 4 Jan 2025 22:49:25 -0800 Subject: [PATCH 186/264] allow narrow arena debug output --- include/mimalloc.h | 2 +- src/arena.c | 29 +++++++++++------------------ 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index fb7efcde..281f5ead 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? 
*/, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef void* mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index 64b1327f..55b6fb9b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1377,10 +1377,9 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, return bit_set_count; } -#define MI_FIELDS_PER_LINE (4) - -static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena) { - _mi_output_message("\x1B[37m%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header); +static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) { + _mi_output_message("\x1B[37m%s%s%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header1, header2, header3); + const size_t fields_per_line = (narrow ? 2 : 4); size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < chunk_count && bit_count < slice_count; i++) { @@ -1408,7 +1407,7 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ long bit_of_page = 0; mi_ansi_color_t color_of_page = MI_GRAY; for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { - if (j > 0 && (j % MI_FIELDS_PER_LINE) == 0) { + if (j > 0 && (j % fields_per_line) == 0) { // buf[k++] = '\n'; _mi_memset(buf+k,' ',7); k += 7; _mi_output_message(" %s\n\x1B[37m", buf); _mi_memzero(buf, sizeof(buf)); @@ -1435,20 +1434,11 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ return bit_set_count; } -//static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { -// return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], NULL, invert, arena); -//} - -static size_t mi_debug_show_bitmap_binned(const char* header, size_t slice_count, mi_bitmap_t* bitmap, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena) { - return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena); +static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count, mi_bitmap_t* bitmap, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) { + return mi_debug_show_chunks(header1, header2, header3, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena, narrow); } -//static size_t mi_debug_show_bbitmap(const char* header, size_t slice_count, mi_bbitmap_t* bbitmap, bool invert, mi_arena_t* arena) { -// return mi_debug_show_chunks(header, slice_count, mi_bbitmap_chunk_count(bbitmap), &bbitmap->chunks[0], &bbitmap->chunk_bins[0], invert, arena); -//} - - -void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { +void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept { mi_subproc_t* subproc = _mi_subproc(); size_t max_arenas = mi_arenas_get_count(subproc); //size_t free_total = 0; @@ -1472,7 +1462,10 @@ void 
mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap_binned("pages (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved) (chunk bin: S:small, M:medium, L:large, X:other)", arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena); + const char* header1 = "pages (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:meta-data, ~:free-purgable, _:free-committed, .:free-reserved)"; + const char* header2 = (narrow ? "\n " : " "); + const char* header3 = "(chunk bin: S:small, M : medium, L : large, X : other)"; + page_total += mi_debug_show_bitmap_binned(header1, header2, header3, arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena, narrow); } } // if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); From 18244cebc5e40aef6c7a8377c0885f316a993f20 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 5 Jan 2025 11:03:41 -0800 Subject: [PATCH 187/264] refine MI_ENABLE_LARGE_PAGES --- include/mimalloc/types.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ec4144d1..1d3c7b07 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -270,16 +270,19 @@ typedef uint8_t mi_heaptag_t; // The `local_free` and `thread_free` lists are migrated to the `free` list // when it is exhausted. The separate `local_free` list is necessary to // implement a monotonic heartbeat. The `thread_free` list is needed for -// avoiding atomic operations in the common case. +// avoiding atomic operations when allocating from the owning thread. // // `used - |thread_free|` == actual blocks that are in use (alive) // `used - |thread_free| + |free| + |local_free| == capacity` // -// We don't count `freed` (as |free|) but use `used` to reduce +// We don't count "freed" (as |free|) but use only the `used` field to reduce // the number of memory accesses in the `mi_page_all_free` function(s). +// Use `_mi_page_free_collect` to collect the thread_free list and update the `used` count. // // Notes: -// - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). +// - Non-atomic fields can only be accessed if having _ownership_ (low bit of `xthread_free` is 1). +// Combining the `thread_free` list with an ownership bit allows a concurrent `free` to atomically +// free an object and (re)claim ownership if the page was abandoned. // - If a page is not part of a heap it is called "abandoned" (`heap==NULL`) -- in // that case the `xthreadid` is 0 or 4 (4 is for abandoned pages that // are in the abandoned page lists of an arena, these are called "mapped" abandoned pages). @@ -288,17 +291,17 @@ typedef uint8_t mi_heaptag_t; // - Using `uint16_t` does not seem to slow things down typedef struct mi_page_s { - _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= heap->thread_id (or 0 if abandoned) | page_flags) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. 
(= `heap->thread_id (or 0 if abandoned) | page_flags`) mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) + uint16_t capacity; // number of blocks committed uint16_t reserved; // number of blocks reserved in memory uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t retire_expire; // expiration count for retired blocks mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads + _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads (= `mi_block_t* | (1 if owned)`) size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks @@ -333,13 +336,16 @@ typedef struct mi_page_s { #endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. -// (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#if MI_ENABLE_LARGE_PAGES #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 256 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#else +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB +#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // <= 64 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#endif #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) - #if (MI_LARGE_MAX_OBJ_WSIZE >= 655360) #error "mimalloc internal: define more bins" #endif @@ -352,7 +358,7 @@ typedef struct mi_page_s { typedef enum mi_page_kind_e { MI_PAGE_SMALL, // small blocks go into 64KiB pages MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages - MI_PAGE_LARGE, // larger blocks go into 4MiB pages + MI_PAGE_LARGE, // larger blocks go into 4MiB pages (if `MI_ENABLE_LARGE_PAGES==1`) MI_PAGE_SINGLETON // page containing a single block. // used for blocks `> MI_LARGE_MAX_OBJ_SIZE` or an aligment `> MI_PAGE_MAX_OVERALLOC_ALIGN`. } mi_page_kind_t; From a9324a2f2fe2e58f42b05c69a3e3cb291d7bc1b2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 5 Jan 2025 11:06:37 -0800 Subject: [PATCH 188/264] merge from dev3 --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index c1144616..9fcc6ef3 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From bbe81101db67b02c97037b1a1b5c17c688aee6f8 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 5 Jan 2025 11:12:27 -0800 Subject: [PATCH 189/264] add comment --- include/mimalloc/types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 1d3c7b07..e45da9a7 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -99,7 +99,8 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ENCODE_FREELIST 1 #endif -// Enable large pages for objects between 128KiB and 512KiB. Disabled by default. +// Enable large pages for objects between 64KiB and 256KiB. +// Disabled by default as for many workloads the block sizes above 64 KiB are quite random which can lead to too many partially used large pages. #ifndef MI_ENABLE_LARGE_PAGES #define MI_ENABLE_LARGE_PAGES 0 #endif From bd3392466b151767ad449f82007091d693e685ae Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 5 Jan 2025 11:39:42 -0800 Subject: [PATCH 190/264] remove mi_debug_show_arenas parameter --- include/mimalloc.h | 2 +- src/arena.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 281f5ead..10695def 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef void* mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index 55b6fb9b..f7e7b44a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1438,7 +1438,7 @@ static size_t mi_debug_show_bitmap_binned(const char* header1, const char* heade return mi_debug_show_chunks(header1, header2, header3, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena, narrow); } -void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept { +static void mi_debug_show_arenas_ex(bool show_pages, bool narrow) mi_attr_noexcept { mi_subproc_t* subproc = _mi_subproc(); size_t max_arenas = mi_arenas_get_count(subproc); //size_t free_total = 0; @@ -1473,6 +1473,10 @@ void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept { if (show_pages) _mi_output_message("total pages in arenas: %zu\n", page_total); } +void mi_debug_show_arenas(void) mi_attr_noexcept { + mi_debug_show_arenas_ex(true /* show pages */, false /* narrow? */); +} + /* ----------------------------------------------------------- Reserve a huge page arena. 
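[Editor's note: the sketch below is illustrative only and is not part of any commit in this series.]
The free-path rework earlier in this series depends on the page flags living in the low bits of
`xthread_id`, so a single XOR with the current thread id classifies a call to `mi_free`
(names as used in the patches, logic simplified):

    const mi_threadid_t xtid = _mi_prim_thread_id() ^ mi_page_xthread_id(page);
    if (xtid == 0)                             { /* local free, no flags set: fast path */ }
    else if (xtid <= MI_PAGE_FLAG_MASK)        { /* local free, but page is full or has aligned blocks */ }
    else if ((xtid & MI_PAGE_FLAG_MASK) == 0)  { /* non-local free, no flags: push on the thread_free list */ }
    else                                       { /* non-local free on a full/aligned/abandoned page: generic path */ }

This is also why `_mi_prim_thread_id` asserts that the bottom flag bits of a thread id are always clear.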
From 8210c9aa0a3508075e99148227e507c4aaafad2c Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 5 Jan 2025 15:47:52 -0800 Subject: [PATCH 191/264] bump version for further development --- cmake/mimalloc-config-version.cmake | 2 +- include/mimalloc.h | 2 +- test/test-stress.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index 60cc2d3d..527b1874 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 3) set(mi_version_minor 0) -set(mi_version_patch 1) +set(mi_version_patch 2) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/include/mimalloc.h b/include/mimalloc.h index 10695def..8b453247 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 301 // major + 2 digits minor +#define MI_MALLOC_VERSION 302 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes diff --git a/test/test-stress.c b/test/test-stress.c index 1f66460f..fb27a786 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -261,9 +261,9 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); - mi_debug_show_arenas(true); + mi_debug_show_arenas(); //mi_collect(true); - //mi_debug_show_arenas(true); + //mi_debug_show_arenas(); } #endif } From 1b5399c965d00901f0303d28d822d0589c190acb Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 5 Jan 2025 15:50:07 -0800 Subject: [PATCH 192/264] set default purge delay to 0 --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index c1144616..9fcc6ef3 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From 86550d09bcf845034a81fb46acbab42dcaf26d23 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 13:19:44 -0800 Subject: [PATCH 193/264] set more conservative options with increased medium and small object sizes --- include/mimalloc/types.h | 8 ++++---- src/options.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index e45da9a7..613bc69c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -337,13 +337,13 @@ typedef struct mi_page_s { #endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
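// (Editor's note, illustrative arithmetic only: with a divisor of 8 at least 8 blocks of the
//  maximum size fit in a page, so the unused tail stays below one block, i.e. under ~12.5%
//  of the page; the divisor of 4 chosen below permits larger in-page objects at the cost of
//  a worst-case tail of roughly 25%.)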
-#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 16 KiB #if MI_ENABLE_LARGE_PAGES -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB #define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 256 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #else -#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // <= 64 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/4) // <= 128 KiB +#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #endif #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/options.c b/src/options.c index 9fcc6ef3..3d34d9b6 100644 --- a/src/options.c +++ b/src/options.c @@ -169,8 +169,8 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. - { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free - { 2, UNINIT, MI_OPTION(page_full_retain) }, + { 0, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION(page_full_retain) }, { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, From b2cdf81e8e2bde09e8c2eb1325f5bfcc3f9e32f9 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 13:34:45 -0800 Subject: [PATCH 194/264] comment --- src/bitmap.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 80bc8ff7..ff1a139f 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1420,9 +1420,16 @@ void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { /* -------------------------------------------------------------------------------- - binned bitmap chunkmap + binned bitmap used to track free slices -------------------------------------------------------------------------------- */ +// Assign a specific size bin to a chunk +static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) { + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin); +} + +// Track the index of the highest chunk that is accessed. 
static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) { size_t oldmax = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); if mi_unlikely(chunk_idx > oldmax) { @@ -1430,12 +1437,13 @@ static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) } } +// Set a bit in the chunkmap static void mi_bbitmap_chunkmap_set(mi_bbitmap_t* bbitmap, size_t chunk_idx, bool check_all_set) { mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); if (check_all_set) { if (mi_bchunk_all_are_set_relaxed(&bbitmap->chunks[chunk_idx])) { // all slices are free in this chunk: return back to the NONE bin - mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], MI_BBIN_NONE); + mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, MI_BBIN_NONE); } } mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL); @@ -1449,7 +1457,7 @@ static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_id // clear the chunkmap bit mi_bchunk_clear(&bbitmap->chunkmap, chunk_idx, NULL); // .. but a concurrent set may have happened in between our all-clear test and the clearing of the - // bit in the mask. We check again to catch this situation. + // bit in the mask. We check again to catch this situation. (note: mi_bchunk_clear must be acq-rel) if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) { mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL); return false; @@ -1458,12 +1466,6 @@ static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_id return true; } -// Assign from the NONE bin to a specific size bin -static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) { - mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); - mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin); -} - /* -------------------------------------------------------------------------------- mi_bbitmap_setN, try_clearN, and is_xsetN From dd4b4a36b1b7ad9bdb4c394cd5a51439a5d62772 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 17:42:42 -0800 Subject: [PATCH 195/264] use standard heap_collect every 10k generic allocations, disable reclaim_on_free by default --- src/heap.c | 2 +- src/options.c | 4 ++-- src/page.c | 16 ++++------------ 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/heap.c b/src/heap.c index abb36da4..b744c153 100644 --- a/src/heap.c +++ b/src/heap.c @@ -123,7 +123,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect arenas (this is program wide so don't force purges on abandonment of threads) //mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1); - _mi_arenas_collect(collect == MI_FORCE /* force purge? */, true /* visit all? */, heap->tld); + _mi_arenas_collect(collect == MI_FORCE /* force purge? */, collect >= MI_FORCE /* visit all? 
*/, heap->tld); } void _mi_heap_collect_abandon(mi_heap_t* heap) { diff --git a/src/options.c b/src/options.c index 9fcc6ef3..8d66b320 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -169,7 +169,7 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. - { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(page_full_retain) }, { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, diff --git a/src/page.c b/src/page.c index 2af89c66..7e52d68f 100644 --- a/src/page.c +++ b/src/page.c @@ -436,7 +436,7 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) { heap->page_retired_max = max; } - +/* static void mi_heap_collect_full_pages(mi_heap_t* heap) { // note: normally full pages get immediately abandoned and the full queue is always empty // this path is only used if abandoning is disabled due to a destroy-able heap or options @@ -457,15 +457,8 @@ static void mi_heap_collect_full_pages(mi_heap_t* heap) { page = next; } } +*/ -static mi_decl_noinline void mi_heap_generic_collect(mi_heap_t* heap) { - // call potential deferred free routines - _mi_deferred_free(heap, false); - // collect retired pages - _mi_heap_collect_retired(heap, false); - // collect full pages that had concurrent free's - mi_heap_collect_full_pages(heap); -} /* ----------------------------------------------------------- Initialize the initial free list in a page. @@ -921,14 +914,13 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // collect every N generic mallocs if mi_unlikely(heap->generic_count++ > 10000) { heap->generic_count = 0; - mi_heap_generic_collect(heap); + mi_heap_collect(heap, false /* force? */); } // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more - mi_heap_generic_collect(heap); - mi_heap_collect(heap, true /* force */); + mi_heap_collect(heap, true /* force? 
*/); page = mi_find_page(heap, size, huge_alignment); } From 061ef80de7c6240c756c6786f73dfbfeba2e006c Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 21:39:11 -0800 Subject: [PATCH 196/264] clarify allow_destroy --- src/free.c | 4 ++-- src/heap.c | 14 +++++++------- src/init.c | 1 + 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/free.c b/src/free.c index ed1b830e..5d9628f0 100644 --- a/src/free.c +++ b/src/free.c @@ -250,8 +250,8 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations if (!mi_page_is_used_at_frac(page,8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page - !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && - _mi_arenas_page_try_reabandon_to_mapped(page)) + !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && + _mi_arenas_page_try_reabandon_to_mapped(page)) { return; } diff --git a/src/heap.c b/src/heap.c index b744c153..6d5e328e 100644 --- a/src/heap.c +++ b/src/heap.c @@ -167,7 +167,7 @@ mi_heap_t* mi_heap_get_backing(void) { } // todo: make order of parameters consistent (but would that break compat with CPython?) -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, uint8_t heap_tag, mi_tld_t* tld) { mi_assert_internal(heap!=NULL); mi_memid_t memid = heap->memid; @@ -175,15 +175,15 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->memid = memid; heap->tld = tld; // avoid reading the thread-local tld during initialization heap->exclusive_arena = _mi_arena_from_id(arena_id); - heap->allow_page_reclaim = !noreclaim; - heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_page_full_retain) >= 0); + heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_reclaim_on_free)); + heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0); heap->full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. - // (but abandoning is good in this case) heap->allow_page_reclaim = false; - // and halve the full page retain (possibly to 0) + // .. but abandoning is good in this case: quarter the full page retain (possibly to 0) + // (so blocked threads do not hold on to too much memory) if (heap->full_page_retain >= 0) { heap->full_page_retain = heap->full_page_retain / 4; } @@ -236,12 +236,12 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi } mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { - return mi_heap_new_ex(0 /* default heap tag */, false /* don't allow `mi_heap_destroy` */, arena_id); + return mi_heap_new_ex(0 /* default heap tag */, false /* allow destroy? */, arena_id); } mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { // don't reclaim abandoned memory or otherwise destroy is unsafe - return mi_heap_new_ex(0 /* default heap tag */, true /* no reclaim */, _mi_arena_id_none()); + return mi_heap_new_ex(0 /* default heap tag */, true /* allow destroy? 
*/, _mi_arena_id_none()); } bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { diff --git a/src/init.c b/src/init.c index 1d352248..40d6143f 100644 --- a/src/init.c +++ b/src/init.c @@ -259,6 +259,7 @@ static void mi_heap_main_init(void) { //heap_main.keys[0] = _mi_heap_random_next(&heap_main); //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); + heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_reclaim_on_free); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); heap_main.full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } From 57eee51f46b4a5710468259c3251157900f9abcd Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 21:42:30 -0800 Subject: [PATCH 197/264] rename full_page_retain to page_full_retain for consistency with the option --- include/mimalloc/types.h | 2 +- src/heap.c | 6 +++--- src/init.c | 2 +- src/page.c | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index e45da9a7..c61b0498 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -426,7 +426,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. size_t generic_count; // how often is mimalloc_generic invoked? mi_heap_t* next; // list of heaps per thread - long full_page_retain; // how many full pages can be retained per queue (before abondoning them) + long page_full_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types diff --git a/src/heap.c b/src/heap.c index 6d5e328e..82ca05cb 100644 --- a/src/heap.c +++ b/src/heap.c @@ -177,15 +177,15 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, heap->exclusive_arena = _mi_arena_from_id(arena_id); heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_reclaim_on_free)); heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0); - heap->full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); + heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. heap->allow_page_reclaim = false; // .. 
but abandoning is good in this case: quarter the full page retain (possibly to 0) // (so blocked threads do not hold on to too much memory) - if (heap->full_page_retain >= 0) { - heap->full_page_retain = heap->full_page_retain / 4; + if (heap->page_full_retain >= 0) { + heap->page_full_retain = heap->page_full_retain / 4; } } diff --git a/src/init.c b/src/init.c index 40d6143f..ac49d292 100644 --- a/src/init.c +++ b/src/init.c @@ -261,7 +261,7 @@ static void mi_heap_main_init(void) { _mi_heap_guarded_init(&heap_main); heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_reclaim_on_free); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); - heap_main.full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); + heap_main.page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } } diff --git a/src/page.c b/src/page.c index 7e52d68f..d2d6a854 100644 --- a/src/page.c +++ b/src/page.c @@ -680,7 +680,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m size_t count = 0; #endif long candidate_limit = 0; // we reset this on the first candidate to limit the search - long full_page_retain = heap->full_page_retain; + long page_full_retain = heap->page_full_retain; mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -703,8 +703,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // if the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often. if (!immediate_available && !mi_page_is_expandable(page)) { - full_page_retain--; - if (full_page_retain < 0) { + page_full_retain--; + if (page_full_retain < 0) { mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); mi_page_to_full(page, pq); } From 0caf80ec3c59de23dc5865de34d321df22e40fa4 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 21:50:55 -0800 Subject: [PATCH 198/264] default purge delay to 100ms --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index 3d34d9b6..a920fdcb 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 100, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas)
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose

From d9065115cd4c82e81df545a7417935ee3b86a93c Mon Sep 17 00:00:00 2001
From: daanx
Date: Mon, 13 Jan 2025 14:49:06 -0800
Subject: [PATCH 199/264] fix netBSD compilation (issue #988)

---
 src/prim/unix/prim.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 4c4a013e..8ef0bd72 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -201,7 +201,8 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p
 void* p = NULL;
 #if defined(MAP_ALIGNED) // BSD
 if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
- size_t n = mi_bsr(try_alignment);
+ size_t idx;
+ size_t n = mi_bsr(try_alignment, &idx);
 if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB
 p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
 if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {

From c9d623a2ef325258eaa692234d1139481eea0cf2 Mon Sep 17 00:00:00 2001
From: daanx
Date: Mon, 13 Jan 2025 16:02:35 -0800
Subject: [PATCH 200/264] add INTERFACE_INCLUDE_DIRECTORIES to vcpkg wrapper

---
 contrib/vcpkg/readme.md | 2 +-
 contrib/vcpkg/vcpkg-cmake-wrapper.cmake | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/contrib/vcpkg/readme.md b/contrib/vcpkg/readme.md
index b1f6047c..014f2867 100644
--- a/contrib/vcpkg/readme.md
+++ b/contrib/vcpkg/readme.md
@@ -9,7 +9,7 @@ to check out a specific commit, version, or branch of mimalloc, or set further o
 You can install such custom port as:

 ```sh
-$ vcpkg install mimalloc[override] --recurse --overlay-ports=./contrib/vcpkg
+$ vcpkg install "mimalloc[override]" --recurse --overlay-ports=./contrib/vcpkg
 ```

 This will also show the correct sha512 hash if you use a custom version.
diff --git a/contrib/vcpkg/vcpkg-cmake-wrapper.cmake b/contrib/vcpkg/vcpkg-cmake-wrapper.cmake
index 1b355722..6b917347 100644
--- a/contrib/vcpkg/vcpkg-cmake-wrapper.cmake
+++ b/contrib/vcpkg/vcpkg-cmake-wrapper.cmake
@@ -17,4 +17,5 @@ endif()
 if(TARGET mimalloc-static AND NOT TARGET mimalloc)
 add_library(mimalloc INTERFACE IMPORTED)
 set_target_properties(mimalloc PROPERTIES INTERFACE_LINK_LIBRARIES mimalloc-static)
+ set_target_properties(mimalloc PROPERTIES INTERFACE_INCLUDE_DIRECTORIES mimalloc-static)
 endif()

From e4befd1ce820c6988210156e291d5021e263b5d3 Mon Sep 17 00:00:00 2001
From: Daan
Date: Mon, 13 Jan 2025 17:02:02 -0800
Subject: [PATCH 201/264] vcpkg: bump sha

---
 contrib/vcpkg/portfile.cmake | 4 ++--
 contrib/vcpkg/vcpkg.json | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/contrib/vcpkg/portfile.cmake b/contrib/vcpkg/portfile.cmake
index f5f39009..55f0172f 100644
--- a/contrib/vcpkg/portfile.cmake
+++ b/contrib/vcpkg/portfile.cmake
@@ -5,11 +5,11 @@ vcpkg_from_github(
 # The "REF" can be a commit hash, branch name (dev2), or a version (v2.2.1).
 # REF "v${VERSION}"
- REF 866ce5b89db1dbc3e66bbf89041291fd16329518
+ REF 6a89f8554eaab8d8d00e17b5b09f79e1d8dbf61b

 # The sha512 is the hash of the tar.gz bundle.
 # (To get the sha512, run `vcpkg install mimalloc[override] --overlay-ports=` and copy the sha from the error message.)
- SHA512 0b0e5ff823c49b9534b8c32800679806c5d7c29020af058da043c3e6e36ae3c32a1cdd5a21ece97dd60bc7dd4703967f683beac435dbb8514638a6cc55e5dea8 + SHA512 32b87a3195efcc558b83a546348a8fb544fed335cdd6c9f8e7e9d0e8e64540fdcf1f4aa57fd0e783b78731518f4810292b832227d7e7665bf8426f1e6ce96f9d ) vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS diff --git a/contrib/vcpkg/vcpkg.json b/contrib/vcpkg/vcpkg.json index bdbe9ba1..95d3b15d 100644 --- a/contrib/vcpkg/vcpkg.json +++ b/contrib/vcpkg/vcpkg.json @@ -1,6 +1,6 @@ { "name": "mimalloc", - "version": "1.9.2", + "version": "3.0.2", "port-version": 2, "description": "Compact general purpose allocator with excellent performance", "homepage": "https://github.com/microsoft/mimalloc", From bc10fe27c657d72fb26592f78d31e9d763165438 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 15 Jan 2025 11:37:20 -0800 Subject: [PATCH 202/264] fix unregister from the page-map --- src/page-map.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/page-map.c b/src/page-map.c index 1cf0b07b..25f8a7ec 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -296,12 +296,16 @@ void _mi_page_map_register(mi_page_t* page) { void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); + if mi_unlikely(_mi_page_map == NULL) return; // get index and count size_t slice_count; size_t sub_idx; const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); // unset the offsets - mi_page_map_set_range(page, idx, sub_idx, slice_count); + mi_page_map_set_range(NULL, idx, sub_idx, slice_count); } void _mi_page_map_unregister_range(void* start, size_t size) { From be2cb44de44b13c5905886a16c7b46c498125321 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 15 Jan 2025 12:02:34 -0800 Subject: [PATCH 203/264] fix NULL pointer in _mi_safe_ptr_page to return a reference to the empty page --- src/page-map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/page-map.c b/src/page-map.c index 25f8a7ec..641ab405 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -305,7 +305,7 @@ void _mi_page_map_unregister(mi_page_t* page) { size_t sub_idx; const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); // unset the offsets - mi_page_map_set_range(NULL, idx, sub_idx, slice_count); + // mi_page_map_set_range(NULL, idx, sub_idx, slice_count); } void _mi_page_map_unregister_range(void* start, size_t size) { @@ -318,6 +318,7 @@ void _mi_page_map_unregister_range(void* start, size_t size) { mi_page_t* _mi_safe_ptr_page(const void* p) { if mi_unlikely(p >= mi_page_map_max_address) return NULL; + if (p == NULL) return (mi_page_t*)&_mi_page_empty; // to match mi_free expectation size_t sub_idx; const size_t idx = _mi_page_map_index(p,&sub_idx); if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL; From 5af1eb1144bf4777495f76bfff435443e8302e7f Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 15 Jan 2025 12:07:06 -0800 Subject: [PATCH 204/264] fix NULL pointer in _mi_safe_ptr_page to return a reference to the empty page --- src/page-map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/page-map.c b/src/page-map.c index 641ab405..be99814c 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -315,10 +315,10 @@ void _mi_page_map_unregister_range(void* start, size_t size) { mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not 
already committed? } - +// Return the empty page for the NULL pointer to match the behaviour of `_mi_ptr_page` mi_page_t* _mi_safe_ptr_page(const void* p) { if mi_unlikely(p >= mi_page_map_max_address) return NULL; - if (p == NULL) return (mi_page_t*)&_mi_page_empty; // to match mi_free expectation + if (p == NULL) return (mi_page_t*)&_mi_page_empty; // to match `_mi_ptr_page` (see `mi_free` as well) size_t sub_idx; const size_t idx = _mi_page_map_index(p,&sub_idx); if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL; @@ -328,7 +328,7 @@ mi_page_t* _mi_safe_ptr_page(const void* p) { } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - return (_mi_safe_ptr_page(p) != NULL); + return (p != NULL && _mi_safe_ptr_page(p) != NULL); } #endif From 7b8a7107747935d059c04ee8d555dc8170057d35 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 16 Jan 2025 14:00:42 -0800 Subject: [PATCH 205/264] windows on arm threadpool detect --- src/prim/windows/prim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 5ba7aa4f..da664318 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -834,7 +834,7 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { #endif bool _mi_prim_thread_is_in_threadpool(void) { - #if (MI_ARCH_X64 || MI_ARCH_X86) + #if (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64) if (win_major_version >= 6) { // check if this thread belongs to a windows threadpool // see: From 899fd7694b15d31e3fb86c3d099cc6c2e4f144df Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 19:28:43 -0800 Subject: [PATCH 206/264] fix unused function warnings; unregister pages --- src/bitmap.c | 24 +++++++++++++----------- src/page-map.c | 4 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index ff1a139f..8a7a9442 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -153,11 +153,11 @@ static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already return (old==0); } -static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) { - const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero()); - if (all_clear!=NULL) { *all_clear = true; } - return (~old==0); -} +// static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) { +// const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero()); +// if (all_clear!=NULL) { *all_clear = true; } +// return (~old==0); +// } // ------- mi_bfield_atomic_try_clear --------------------------------------- @@ -434,12 +434,12 @@ static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t } // Clear a full aligned bfield. -static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); - const size_t i = cidx / MI_BFIELD_BITS; - return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); -} +// static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { +// mi_assert_internal(cidx < MI_BCHUNK_BITS); +// mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); +// const size_t i = cidx / MI_BFIELD_BITS; +// return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); +// } // Try to atomically clear a sequence of `n` bits within a chunk. 
// Returns true if all bits transitioned from 1 to 0, @@ -717,6 +717,7 @@ static inline bool mi_bchunk_try_find_and_clear_8(mi_bchunk_t* chunk, size_t n, // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find large size pages in the free blocks. // todo: try neon version +/* static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { @@ -759,6 +760,7 @@ static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, mi_assert_internal(n==MI_BFIELD_BITS); MI_UNUSED(n); return mi_bchunk_try_find_and_clearX(chunk, pidx); } +*/ // find a sequence of `n` bits in a chunk with `0 < n <= MI_BFIELD_BITS` with all bits set, // and try to clear them atomically. diff --git a/src/page-map.c b/src/page-map.c index be99814c..2b610935 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -298,17 +298,17 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); mi_assert_internal(page != NULL); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_page_map != NULL); if mi_unlikely(_mi_page_map == NULL) return; // get index and count size_t slice_count; size_t sub_idx; const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); // unset the offsets - // mi_page_map_set_range(NULL, idx, sub_idx, slice_count); + mi_page_map_set_range(NULL, idx, sub_idx, slice_count); } void _mi_page_map_unregister_range(void* start, size_t size) { + if mi_unlikely(_mi_page_map == NULL) return; const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); size_t sub_idx; const uintptr_t idx = _mi_page_map_index(start, &sub_idx); From 6137ae4ab8f507a8b70b722ca8f075c52338278d Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 20:12:13 -0800 Subject: [PATCH 207/264] fix page_flags --- include/mimalloc/internal.h | 32 +++++++++++++++++--------------- include/mimalloc/types.h | 2 +- src/arena.c | 10 ++++++++-- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 535fe1fb..e43d4420 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -597,19 +597,6 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { return page->heap; } -static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { - if (heap != NULL) { - page->heap = heap; - page->heap_tag = heap->tag; - mi_atomic_store_release(&page->xthread_id, heap->tld->thread_id); - } - else { - page->heap = NULL; - mi_atomic_store_release(&page->xthread_id,0); - } -} - - // Thread free flag helpers static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { return (mi_block_t*)(tf & ~1); @@ -700,11 +687,11 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_page_xthread_id(page) <= MI_PAGE_IS_ABANDONED_MAPPED); + return (mi_page_thread_id(page) == 0); } static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return (mi_page_xthread_id(page) == MI_PAGE_IS_ABANDONED_MAPPED); + return ((mi_page_xthread_id(page) & ~(MI_PAGE_IS_ABANDONED_MAPPED - 1)) == MI_PAGE_IS_ABANDONED_MAPPED); } static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { @@ -801,6 +788,21 @@ static inline void 
mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED); } +static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { + mi_assert_internal(!mi_page_is_in_full(page)); + const mi_page_flags_t flags = mi_page_flags(page); + const mi_threadid_t tid = (heap != NULL ? heap->tld->thread_id : 0) | flags; // for MI_PAGE_HAS_ALIGNED + if (heap != NULL) { + page->heap = heap; + page->heap_tag = heap->tag; + } + else { + page->heap = NULL; + } + mi_atomic_store_release(&page->xthread_id, tid); +} + + /* ------------------------------------------------------------------- Guarded objects ------------------------------------------------------------------- */ diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 7e968e10..2a1702ff 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -248,7 +248,7 @@ typedef struct mi_block_s { // `is_abandoned_mapped` is true if the page is abandoned (thread_id==0) and it is in an arena so can be quickly found for reuse ("mapped") #define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) #define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) -#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) +#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) // must be highest flag (see `internal.h:mi_page_is_abandoned_mapped`) #define MI_PAGE_FLAG_MASK MI_ZU(0x07) typedef size_t mi_page_flags_t; diff --git a/src/arena.c b/src/arena.c index bcde865e..e111a417 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1833,9 +1833,15 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* // find accessed size size_t asize; // scan the commit map for the highest entry + // scan the commit map for the highest entry size_t idx; - if (mi_bitmap_bsr(arena->slices_committed, &idx)) { - asize = (idx + 1)* MI_ARENA_SLICE_SIZE; + //if (mi_bitmap_bsr(arena->slices_committed, &idx)) { + // asize = (idx + 1)* MI_ARENA_SLICE_SIZE; + //} + if (mi_bitmap_bsr(arena->pages, &idx)) { + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, idx); + const size_t page_slice_count = page->memid.mem.arena.slice_count; + asize = mi_size_of_slices(idx + page_slice_count); } else { asize = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; From 3f6d286a088c726b96a38d38bed6000249b098bf Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 20:38:02 -0800 Subject: [PATCH 208/264] fix bug in page flag set that would keep pages abandoned --- include/mimalloc/internal.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e43d4420..d96cfa4c 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -790,11 +790,12 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(!mi_page_is_in_full(page)); - const mi_page_flags_t flags = mi_page_flags(page); - const mi_threadid_t tid = (heap != NULL ? heap->tld->thread_id : 0) | flags; // for MI_PAGE_HAS_ALIGNED + // only the aligned flag is retained (and in particular clear the abandoned-mapped flag). + const mi_page_flags_t flags = (mi_page_has_aligned(page) ? MI_PAGE_HAS_ALIGNED : 0); + const mi_threadid_t tid = (heap == NULL ? 
0 : heap->tld->thread_id) | flags; if (heap != NULL) { page->heap = heap; - page->heap_tag = heap->tag; + page->heap_tag = heap->tag; } else { page->heap = NULL; From 570b6b5a7a4509cf659b38ff032eeedb58923db2 Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 20:53:16 -0800 Subject: [PATCH 209/264] slightly better bsf --- include/mimalloc/bits.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 5b847f4b..64875e9d 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -205,9 +205,8 @@ static inline size_t mi_ctz(size_t x) { #elif mi_has_builtinz(ctz) return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) - if (x==0) return MI_SIZE_BITS; - size_t r; - __asm ("bsf\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + size_t r = MI_SIZE_BITS; // bsf leaves destination unmodified if the argument is 0 (see ) + __asm ("bsf\t%1, %0" : "+r"(r) : "r"(x) : "cc"); return r; #elif MI_HAS_FAST_POPCOUNT return (x!=0 ? (mi_popcount(x^(x-1))-1) : MI_SIZE_BITS); From 5946e9cebf8e713fc17d23417cc6c34acf6cd76f Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 20:58:45 -0800 Subject: [PATCH 210/264] fix assert --- include/mimalloc/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index d96cfa4c..01373025 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -789,7 +789,7 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { } static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { - mi_assert_internal(!mi_page_is_in_full(page)); + // mi_assert_internal(!mi_page_is_in_full(page)); // can happen when destroying pages on heap_destroy // only the aligned flag is retained (and in particular clear the abandoned-mapped flag). const mi_page_flags_t flags = (mi_page_has_aligned(page) ? MI_PAGE_HAS_ALIGNED : 0); const mi_threadid_t tid = (heap == NULL ? 0 : heap->tld->thread_id) | flags; From 7703d14e8c3cf47140270b00e10cefcc4eea18cd Mon Sep 17 00:00:00 2001 From: Daan Date: Wed, 22 Jan 2025 11:21:22 -0800 Subject: [PATCH 211/264] redefine abandoned mapped as a special thread id --- include/mimalloc/internal.h | 215 +++++++++++++++++------------------- include/mimalloc/types.h | 12 +- 2 files changed, 110 insertions(+), 117 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 01373025..8e7ed5e9 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -597,45 +597,6 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { return page->heap; } -// Thread free flag helpers -static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { - return (mi_block_t*)(tf & ~1); -} -static inline bool mi_tf_is_owned(mi_thread_free_t tf) { - return ((tf & 1) == 1); -} -static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { - return (mi_thread_free_t)((uintptr_t)block | (owned ? 
1 : 0)); -} - - -// Thread id of thread that owns this page (with flags in the bottom 2 bits) -static inline mi_threadid_t mi_page_xthread_id(const mi_page_t* page) { - return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id); -} - -// Plain thread id of the thread that owns this page -static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { - return (mi_page_xthread_id(page) & ~MI_PAGE_FLAG_MASK); -} - -// Thread free access -static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { - return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); -} - -// Owned? -static inline bool mi_page_is_owned(const mi_page_t* page) { - return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); -} - - -//static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { -// return mi_tf_make(mi_tf_block(tf),delayed); -//} -//static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { -// return mi_tf_make(block, mi_tf_delayed(tf)); -//} // are all blocks in a page freed? // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`. @@ -644,12 +605,6 @@ static inline bool mi_page_all_free(const mi_page_t* page) { return (page->used == 0); } -// are there any available blocks? -static inline bool mi_page_has_any_available(const mi_page_t* page) { - mi_assert_internal(page != NULL && page->reserved > 0); - return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); -} - // are there immediately available blocks, i.e. blocks available on the free list. static inline bool mi_page_immediate_available(const mi_page_t* page) { mi_assert_internal(page != NULL); @@ -685,25 +640,6 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { return (page->reserved - page->used <= frac); } -static inline bool mi_page_is_abandoned(const mi_page_t* page) { - // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_page_thread_id(page) == 0); -} - -static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return ((mi_page_xthread_id(page) & ~(MI_PAGE_IS_ABANDONED_MAPPED - 1)) == MI_PAGE_IS_ABANDONED_MAPPED); -} - -static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { - mi_assert_internal(mi_page_is_abandoned(page)); - mi_atomic_or_relaxed(&page->xthread_id, MI_PAGE_IS_ABANDONED_MAPPED); -} - -static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { - mi_assert_internal(mi_page_is_abandoned_mapped(page)); - mi_atomic_and_relaxed(&page->xthread_id, ~MI_PAGE_IS_ABANDONED_MAPPED); -} - static inline bool mi_page_is_huge(const mi_page_t* page) { return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || @@ -717,6 +653,109 @@ static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) } +//----------------------------------------------------------- +// Page thread id and flags +//----------------------------------------------------------- + +// Thread id of thread that owns this page (with flags in the bottom 2 bits) +static inline mi_threadid_t mi_page_xthread_id(const mi_page_t* page) { + return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id); +} + +// Plain thread id of the thread that owns this page +static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { + return (mi_page_xthread_id(page) & ~MI_PAGE_FLAG_MASK); +} + +static inline mi_page_flags_t mi_page_flags(const 
mi_page_t* page) { + return (mi_page_xthread_id(page) & MI_PAGE_FLAG_MASK); +} + +static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { + if (set) { mi_atomic_or_relaxed(&page->xthread_id, newflag); } + else { mi_atomic_and_relaxed(&page->xthread_id, ~newflag); } +} + +static inline bool mi_page_is_in_full(const mi_page_t* page) { + return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0); +} + +static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { + mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE); +} + +static inline bool mi_page_has_aligned(const mi_page_t* page) { + return ((mi_page_flags(page) & MI_PAGE_HAS_ALIGNED) != 0); +} + +static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { + mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED); +} + +static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { + // mi_assert_internal(!mi_page_is_in_full(page)); // can happen when destroying pages on heap_destroy + const mi_threadid_t tid = (heap == NULL ? MI_THREADID_ABANDONED : heap->tld->thread_id) | mi_page_flags(page); + if (heap != NULL) { + page->heap = heap; + page->heap_tag = heap->tag; + } + else { + page->heap = NULL; + } + mi_atomic_store_release(&page->xthread_id, tid); +} + +static inline bool mi_page_is_abandoned(const mi_page_t* page) { + // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) + return (mi_page_thread_id(page) <= MI_THREADID_ABANDONED_MAPPED); +} + +static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { + return (mi_page_thread_id(page) == MI_THREADID_ABANDONED_MAPPED); +} + +static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { + mi_assert_internal(mi_page_is_abandoned(page)); + mi_atomic_or_relaxed(&page->xthread_id, MI_THREADID_ABANDONED_MAPPED); +} + +static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { + mi_assert_internal(mi_page_is_abandoned_mapped(page)); + mi_atomic_and_relaxed(&page->xthread_id, MI_PAGE_FLAG_MASK); +} + +//----------------------------------------------------------- +// Thread free list and ownership +//----------------------------------------------------------- + +// Thread free flag helpers +static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { + return (mi_block_t*)(tf & ~1); +} +static inline bool mi_tf_is_owned(mi_thread_free_t tf) { + return ((tf & 1) == 1); +} +static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { + return (mi_thread_free_t)((uintptr_t)block | (owned ? 1 : 0)); +} + +// Thread free access +static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { + return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); +} + +// are there any available blocks? +static inline bool mi_page_has_any_available(const mi_page_t* page) { + mi_assert_internal(page != NULL && page->reserved > 0); + return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); +} + + +// Owned? 
+static inline bool mi_page_is_owned(const mi_page_t* page) { + return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); +} + // Unown a page that is currently owned static inline void _mi_page_unown_unconditional(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); @@ -725,7 +764,6 @@ static inline void _mi_page_unown_unconditional(mi_page_t* page) { mi_assert_internal((old&1)==1); MI_UNUSED(old); } - // get ownership if it is not yet owned static inline bool mi_page_try_claim_ownership(mi_page_t* page) { const uintptr_t old = mi_atomic_or_acq_rel(&page->xthread_free, 1); @@ -756,53 +794,6 @@ static inline bool _mi_page_unown(mi_page_t* page) { return false; } -//----------------------------------------------------------- -// Page flags -//----------------------------------------------------------- -static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { - return (mi_page_xthread_id(page) & MI_PAGE_FLAG_MASK); -} - -static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { - if (set) { - mi_atomic_or_relaxed(&page->xthread_id, newflag); - } - else { - mi_atomic_and_relaxed(&page->xthread_id, ~newflag); - } -} - -static inline bool mi_page_is_in_full(const mi_page_t* page) { - return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0); -} - -static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { - mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE); -} - -static inline bool mi_page_has_aligned(const mi_page_t* page) { - return ((mi_page_flags(page) & MI_PAGE_HAS_ALIGNED) != 0); -} - -static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { - mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED); -} - -static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { - // mi_assert_internal(!mi_page_is_in_full(page)); // can happen when destroying pages on heap_destroy - // only the aligned flag is retained (and in particular clear the abandoned-mapped flag). - const mi_page_flags_t flags = (mi_page_has_aligned(page) ? MI_PAGE_HAS_ALIGNED : 0); - const mi_threadid_t tid = (heap == NULL ? 
0 : heap->tld->thread_id) | flags; - if (heap != NULL) { - page->heap = heap; - page->heap_tag = heap->tag; - } - else { - page->heap = NULL; - } - mi_atomic_store_release(&page->xthread_id, tid); -} - /* ------------------------------------------------------------------- Guarded objects diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 2a1702ff..0bf5722b 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -242,16 +242,18 @@ typedef struct mi_block_s { } mi_block_t; -// The page flags are put in the bottom 3 bits of the thread_id (for a fast test in `mi_free`) +// The page flags are put in the bottom 2 bits of the thread_id (for a fast test in `mi_free`) // `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing) // `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing) -// `is_abandoned_mapped` is true if the page is abandoned (thread_id==0) and it is in an arena so can be quickly found for reuse ("mapped") #define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) #define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) -#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) // must be highest flag (see `internal.h:mi_page_is_abandoned_mapped`) -#define MI_PAGE_FLAG_MASK MI_ZU(0x07) +#define MI_PAGE_FLAG_MASK MI_ZU(0x03) typedef size_t mi_page_flags_t; +// There are two special threadid's: 0 for abandoned threads, and 4 for abandoned & mapped threads -- +// abandoned-mapped pages are abandoned but also mapped in an arena so can be quickly found for reuse. +#define MI_THREADID_ABANDONED MI_ZU(0) +#define MI_THREADID_ABANDONED_MAPPED (MI_PAGE_FLAG_MASK + 1) // Thread free list. // Points to a list of blocks that are freed by other threads. @@ -292,7 +294,7 @@ typedef uint8_t mi_heaptag_t; // - Using `uint16_t` does not seem to slow things down typedef struct mi_page_s { - _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= `heap->thread_id (or 0 if abandoned) | page_flags`) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. 
(= `heap->thread_id (or 0 or 4 if abandoned) | page_flags`) mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) uint16_t used; // number of blocks in use (including blocks in `thread_free`) From a7370dcbd21f0497bbeb666f22f2e653001ab4c4 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 22 Jan 2025 12:25:02 -0800 Subject: [PATCH 212/264] fix highest allocated page for arena unload --- src/arena.c | 9 +++++++-- test/main-override-dep.cpp | 10 ++++++++++ test/main-override-dep.h | 1 + test/main-override.cpp | 10 ++++++---- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/arena.c b/src/arena.c index bcde865e..4ad4bb0e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1834,8 +1834,13 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* size_t asize; // scan the commit map for the highest entry size_t idx; - if (mi_bitmap_bsr(arena->slices_committed, &idx)) { - asize = (idx + 1)* MI_ARENA_SLICE_SIZE; + //if (mi_bitmap_bsr(arena->slices_committed, &idx)) { + // asize = (idx + 1)* MI_ARENA_SLICE_SIZE; + //} + if (mi_bitmap_bsr(arena->pages, &idx)) { + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, idx); + const size_t page_slice_count = page->memid.mem.arena.slice_count; + asize = mi_size_of_slices(idx + page_slice_count); } else { asize = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; diff --git a/test/main-override-dep.cpp b/test/main-override-dep.cpp index e92f6fc4..edb57f1f 100644 --- a/test/main-override-dep.cpp +++ b/test/main-override-dep.cpp @@ -12,4 +12,14 @@ std::string TestAllocInDll::GetString() std::string r = test; delete[] test; return r; +} + +#include + +void TestAllocInDll::TestHeapAlloc() +{ + HANDLE heap = GetProcessHeap(); + int* p = (int*)HeapAlloc(heap, 0, sizeof(int)); + *p = 42; + HeapFree(heap, 0, p); } \ No newline at end of file diff --git a/test/main-override-dep.h b/test/main-override-dep.h index 4826f25f..9d4aabfd 100644 --- a/test/main-override-dep.h +++ b/test/main-override-dep.h @@ -8,4 +8,5 @@ class TestAllocInDll { public: __declspec(dllexport) std::string GetString(); + __declspec(dllexport) void TestHeapAlloc(); }; diff --git a/test/main-override.cpp b/test/main-override.cpp index db594acc..af385992 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -37,7 +37,7 @@ static void test_thread_local(); // issue #944 static void test_mixed1(); // issue #942 static void test_stl_allocators(); -#if x_WIN32 +#if _WIN32 #include "main-override-dep.h" static void test_dep(); // issue #981: test overriding in another DLL #else @@ -46,8 +46,8 @@ static void test_dep() { }; int main() { mi_stats_reset(); // ignore earlier allocations - various_tests(); - test_mixed1(); + //various_tests(); + //test_mixed1(); test_dep(); @@ -145,11 +145,13 @@ static bool test_stl_allocator1() { struct some_struct { int i; int j; double z; }; -#if x_WIN32 +#if _WIN32 static void test_dep() { TestAllocInDll t; std::string s = t.GetString(); + + t.TestHeapAlloc(); } #endif From dd4b6fc0783868c6ca19a57d6fb341f92a854e1e Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 31 Jan 2025 11:54:51 -0800 Subject: [PATCH 213/264] update options --- include/mimalloc.h | 7 +++---- src/options.c | 7 +++---- src/page.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 8b453247..46335619 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -381,11 +381,11 @@ typedef enum mi_option_e { mi_option_os_tag, // tag used for OS logging (macOS 
only for now) (=100) mi_option_max_errors, // issue at most N error messages mi_option_max_warnings, // issue at most N warning messages - mi_option_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%) + mi_option_deprecated_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%) mi_option_destroy_on_exit, // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`) mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10) - mi_option_purge_extend_delay, + mi_option_deprecated_purge_extend_delay, mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. (only on windows) mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0) @@ -394,8 +394,7 @@ typedef enum mi_option_e { mi_option_guarded_precise, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0) mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) - mi_option_target_segments_per_thread, // experimental (=0) - mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) + mi_option_page_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_page_full_retain, // retain N full pages per size class (=2) mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4) mi_option_max_vabits, // max user space virtual address bits to consider (=48) diff --git a/src/options.c b/src/options.c index 8d66b320..7b643092 100644 --- a/src/options.c +++ b/src/options.c @@ -150,11 +150,11 @@ static mi_option_desc_t options[_mi_option_last] = { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 32, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output { 32, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output - { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. + { 10, UNINIT, MI_OPTION(deprecated_max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { 1, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's - { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, + { 1, UNINIT, MI_OPTION_LEGACY(deprecated_purge_extend_delay, decommit_extend_delay) }, { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
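  // usage sketch: the renamed options above can still be tuned at startup through the regular
  // option API, for example
  //   mi_option_set(mi_option_page_reclaim_on_free, 0);  // disable reclaiming abandoned pages on free
  //   mi_option_set(mi_option_page_full_retain, 4);      // retain up to 4 full pages per size class
  // or via the corresponding MIMALLOC_ environment variables.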
#if defined(MI_VISIT_ABANDONED) @@ -168,8 +168,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_GUARDED_SAMPLE_RATE, UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, - { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. - { 0, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 1, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(page_full_retain) }, { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, diff --git a/src/page.c b/src/page.c index d2d6a854..af1d5072 100644 --- a/src/page.c +++ b/src/page.c @@ -680,7 +680,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m size_t count = 0; #endif long candidate_limit = 0; // we reset this on the first candidate to limit the search - long page_full_retain = heap->page_full_retain; + long page_full_retain = (pq->block_size > MI_SMALL_MAX_OBJ_SIZE ? 0 : heap->page_full_retain); // only retain small pages mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; From 274bcb61db6b7b7447db2b3b0901d7005a242f85 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 31 Jan 2025 12:11:25 -0800 Subject: [PATCH 214/264] update option names --- src/free.c | 4 ++-- src/heap.c | 2 +- src/init.c | 2 +- test/test-stress.c | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/free.c b/src/free.c index 5d9628f0..865efafa 100644 --- a/src/free.c +++ b/src/free.c @@ -217,7 +217,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe // 2. if the page is not too full, we can try to reclaim it for ourselves // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. - if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && + if (_mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && !mi_page_is_used_at_frac(page,8) // && !mi_page_is_abandoned_mapped(page) ) @@ -237,7 +237,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe (_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) 
) { - if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use + if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for a block_size we don't use // first remove it from the abandoned pages in the arena -- this waits for any readers to finish _mi_arenas_page_unabandon(page); _mi_heap_page_reclaim(tagheap, page); diff --git a/src/heap.c b/src/heap.c index 82ca05cb..1ae7e99f 100644 --- a/src/heap.c +++ b/src/heap.c @@ -175,7 +175,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, heap->memid = memid; heap->tld = tld; // avoid reading the thread-local tld during initialization heap->exclusive_arena = _mi_arena_from_id(arena_id); - heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_reclaim_on_free)); + heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_page_reclaim_on_free)); heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0); heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; diff --git a/src/init.c b/src/init.c index ac49d292..33c9794d 100644 --- a/src/init.c +++ b/src/init.c @@ -259,7 +259,7 @@ static void mi_heap_main_init(void) { //heap_main.keys[0] = _mi_heap_random_next(&heap_main); //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); - heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_reclaim_on_free); + heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_page_reclaim_on_free); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); heap_main.page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } diff --git a/test/test-stress.c b/test/test-stress.c index fb27a786..303d9f42 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -303,12 +303,12 @@ int main(int argc, char** argv) { mi_option_enable(mi_option_visit_abandoned); #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) - mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); - mi_option_set(mi_option_purge_delay,1); + // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + // mi_option_set(mi_option_purge_delay,1); #endif #if defined(NDEBUG) && !defined(USE_STD_MALLOC) // mi_option_set(mi_option_purge_delay,-1); - mi_option_set(mi_option_reclaim_on_free, 0); + mi_option_set(mi_option_page_reclaim_on_free, 0); #endif #ifndef USE_STD_MALLOC mi_stats_reset(); From d55fde118981a481655a679b38befda877e78192 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 31 Jan 2025 13:34:16 -0800 Subject: [PATCH 215/264] change defaults in test-stress --- test/test-stress.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test-stress.c b/test/test-stress.c index fb27a786..f7ae6fea 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -303,8 +303,8 @@ int main(int argc, char** argv) { mi_option_enable(mi_option_visit_abandoned); #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) - mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); - mi_option_set(mi_option_purge_delay,1); + // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + // mi_option_set(mi_option_purge_delay,1); #endif #if defined(NDEBUG) && !defined(USE_STD_MALLOC) // mi_option_set(mi_option_purge_delay,-1); From 59eeeadc3473e6d38dd83bc41d317b494df1f8ef Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: 
Tue, 4 Feb 2025 12:26:21 -0800 Subject: [PATCH 216/264] only allow page_reclaim_on_free for small block pages --- bin/readme.md | 2 +- include/mimalloc/internal.h | 2 +- src/free.c | 9 +++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/readme.md b/bin/readme.md index f08b2e87..b79157de 100644 --- a/bin/readme.md +++ b/bin/readme.md @@ -63,7 +63,7 @@ need a specific redirection DLL: mode on Windows arm64. Unfortunately we cannot run x64 code emulated on Windows arm64 with the x64 mimalloc override directly (since the C runtime always uses `arm64ec`). Instead: 1. Build the program as normal for x64 and link as normal with the x64 - `mimalloc.lib` export library. + `mimalloc.dll.lib` export library. 2. Now separately build `mimalloc.dll` in `arm64ec` mode and _overwrite_ your previous (x64) `mimalloc.dll` -- the loader can handle the mix of arm64ec and x64 code. Now use `mimalloc-redirect-arm64ec.dll` to match your new diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 8e7ed5e9..e18390a8 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -433,7 +433,7 @@ static inline bool mi_heap_is_backing(const mi_heap_t* heap) { return (heap->tld->heap_backing == heap); } -static inline bool mi_heap_is_initialized(mi_heap_t* heap) { +static inline bool mi_heap_is_initialized(const mi_heap_t* heap) { mi_assert_internal(heap != NULL); return (heap != NULL && heap != &_mi_heap_empty); } diff --git a/src/free.c b/src/free.c index 865efafa..1a81c504 100644 --- a/src/free.c +++ b/src/free.c @@ -185,7 +185,7 @@ void mi_free(void* p) mi_attr_noexcept else { // page is full or contains (inner) aligned blocks; use generic multi-thread path mi_free_generic_mt(page, p); - } + } } @@ -218,7 +218,8 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe // 2. if the page is not too full, we can try to reclaim it for ourselves // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. if (_mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && - !mi_page_is_used_at_frac(page,8) + page->block_size <= MI_SMALL_MAX_OBJ_SIZE && // only for small sized blocks + !mi_page_is_used_at_frac(page,8) // and not too full // && !mi_page_is_abandoned_mapped(page) ) { @@ -228,11 +229,11 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) mi_heap_t* const heap = mi_prim_get_default_heap(); - if (heap != (mi_heap_t*)&_mi_heap_empty) // we did not already terminate our thread (can this happen? + if (mi_heap_is_initialized(heap)) // we did not already terminate our thread { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); if ((tagheap != NULL) && // don't reclaim across heap object types - (tagheap->allow_page_reclaim) && // we are allowed to reclaim abandoned pages + (tagheap->allow_page_reclaim) && // and we are allowed to reclaim abandoned pages // (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) (_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) 
) From db7930f961ceb781cd4e70140676e389db4576f1 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 4 Feb 2025 14:58:08 -0800 Subject: [PATCH 217/264] avoid atomics in mi_free_try_collect_mt --- include/mimalloc/internal.h | 3 +- src/free.c | 14 ++++--- src/heap.c | 8 ++-- src/page.c | 80 ++++++++++++++++++++++++------------- 4 files changed, 67 insertions(+), 38 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e18390a8..c1e55ddc 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -199,7 +199,8 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force); size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); void _mi_deferred_free(mi_heap_t* heap, bool force); -void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_free_collect(mi_page_t* page, bool force); +void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head); void _mi_page_init(mi_heap_t* heap, mi_page_t* page); size_t _mi_bin_size(uint8_t bin); // for stats diff --git a/src/free.c b/src/free.c index 1a81c504..ebcf08ab 100644 --- a/src/free.c +++ b/src/free.c @@ -48,7 +48,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } // Forward declaration for multi-threaded collect -static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept; +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept; // Free a block multi-threaded static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_noexcept @@ -69,14 +69,14 @@ static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_ mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); do { mi_block_set_next(page, block, mi_tf_block(tf_old)); - tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + tf_new = mi_tf_create(block, true /* always use owned: try to claim it if the page is abandoned */); } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); // todo: release is enough? // and atomically try to collect the page if it was abandoned const bool is_owned_now = !mi_tf_is_owned(tf_old); if (is_owned_now) { mi_assert_internal(mi_page_is_abandoned(page)); - mi_free_try_collect_mt(page); + mi_free_try_collect_mt(page,block); } } @@ -194,18 +194,20 @@ void mi_free(void* p) mi_attr_noexcept // ------------------------------------------------------ -static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept { +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); // we own the page now.. // safe to collect the thread atomic free list - _mi_page_free_collect(page, false); // update `used` count + // use the `_partly` version to avoid atomic operations since we already have the `mt_free` pointing into the thread free list + _mi_page_free_collect_partly(page, mt_free); + #if MI_DEBUG > 1 if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } #endif - // 1. free if the page is free now + // 1. 
free if the page is free now (this is updated by `_mi_page_free_collect_partly`) if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) diff --git a/src/heap.c b/src/heap.c index 1ae7e99f..10c65ff2 100644 --- a/src/heap.c +++ b/src/heap.c @@ -115,14 +115,14 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect retired pages _mi_heap_collect_retired(heap, force); - + // if (_mi_is_main_thread()) { mi_debug_show_arenas(true, false, false); } - + // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - // collect arenas (this is program wide so don't force purges on abandonment of threads) - //mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1); + // collect arenas (this is program wide so don't force purges on abandonment of threads) + //mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1); _mi_arenas_collect(collect == MI_FORCE /* force purge? */, collect >= MI_FORCE /* visit all? */, heap->tld); } diff --git a/src/page.c b/src/page.c index af1d5072..ccb4445b 100644 --- a/src/page.c +++ b/src/page.c @@ -137,9 +137,39 @@ bool _mi_page_is_valid(mi_page_t* page) { Page collect the `local_free` and `thread_free` lists ----------------------------------------------------------- */ -// Collect the local `thread_free` list using an atomic exchange. -static void _mi_page_thread_free_collect(mi_page_t* page) +static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) { + if (head == NULL) return; + + // find the last block in the list -- also to get a proper use count (without data races) + size_t max_count = page->capacity; // cannot collect more than capacity + size_t count = 1; + mi_block_t* last = head; + mi_block_t* next; + while ((next = mi_block_next(page, last)) != NULL && count <= max_count) { + count++; + last = next; + } + + // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free) + if (count > max_count) { + _mi_error_message(EFAULT, "corrupted thread-free list\n"); + return; // the thread-free items cannot be freed + } + + // and append the current local free list + mi_block_set_next(page, last, page->local_free); + page->local_free = head; + + // update counts now + mi_assert_internal(count <= UINT16_MAX); + page->used = page->used - (uint16_t)count; +} + +// Collect the local `thread_free` list using an atomic exchange. +static void mi_page_thread_free_collect(mi_page_t* page) +{ + // atomically capture the thread free list mi_block_t* head; mi_thread_free_t tfreex; mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); @@ -150,35 +180,15 @@ static void _mi_page_thread_free_collect(mi_page_t* page) } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough? 
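  // at this point the whole thread-free list has been captured: the CAS above swapped it for NULL
  // (keeping the owned bit via `mi_tf_is_owned`), and `head` is non-NULL since an empty list
  // returns early inside the loop.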
mi_assert_internal(head != NULL); - // find the tail -- also to get a proper count (without data races) - size_t max_count = page->capacity; // cannot collect more than capacity - size_t count = 1; - mi_block_t* tail = head; - mi_block_t* next; - while( (next = mi_block_next(page,tail)) != NULL && count <= max_count) { - count++; - tail = next; - } - - // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free) - if (count > max_count) { - _mi_error_message(EFAULT, "corrupted thread-free list\n"); - return; // the thread-free items cannot be freed - } - - // and append the current local free list - mi_block_set_next(page,tail, page->local_free); - page->local_free = head; - - // update counts now - page->used -= (uint16_t)count; + // and move it to the local list + mi_page_thread_collect_to_local(page, head); } void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(page!=NULL); // collect the thread free list - _mi_page_thread_free_collect(page); + mi_page_thread_free_collect(page); // and the local free list if (page->local_free != NULL) { @@ -205,6 +215,23 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(!force || page->local_free == NULL); } +// collect elements in the thread-free list starting at `head`. +void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) { + if (head == NULL) return; + mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point at it (and we want to avoid atomic ops) + if (next != NULL) { + mi_page_thread_collect_to_local(page, next); + if (page->local_free != NULL && page->free == NULL) { + page->free = page->local_free; + page->local_free = NULL; + page->free_is_zero = false; + } + } + if (page->used == 1) { + // all elements are free'd since we skipped the `head` element itself + _mi_page_free_collect(page, false); // collect the final element + } +} /* ----------------------------------------------------------- @@ -333,9 +360,8 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { // abandon full pages _mi_page_abandon(page, pq); } - else { + else if (!mi_page_is_in_full(page)) { // put full pages in a heap local queue - if (mi_page_is_in_full(page)) return; mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page); _mi_page_free_collect(page, false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set } From b0c8d86c41066832d35db85952d65f483b1fecf6 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 4 Feb 2025 15:03:27 -0800 Subject: [PATCH 218/264] refactor mi_free_try_collect_mt --- src/free.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/free.c b/src/free.c index ebcf08ab..5e83ad95 100644 --- a/src/free.c +++ b/src/free.c @@ -217,12 +217,13 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* return; } + const bool too_full = mi_page_is_used_at_frac(page, 8); // more than 7/8th of the page is in use? + // 2. if the page is not too full, we can try to reclaim it for ourselves // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. 
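  // (the speedup mainly comes from the freeing thread adopting the abandoned page: subsequent frees
  //  of its blocks then become cheap local frees instead of atomic pushes onto `xthread_free`, and
  //  the page can be used again for local allocation.)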
- if (_mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && - page->block_size <= MI_SMALL_MAX_OBJ_SIZE && // only for small sized blocks - !mi_page_is_used_at_frac(page,8) // and not too full - // && !mi_page_is_abandoned_mapped(page) + if (!too_full && + _mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && + page->block_size <= MI_SMALL_MAX_OBJ_SIZE // only for small sized blocks ) { // the page has still some blocks in use (but not too many) @@ -252,7 +253,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* } // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!mi_page_is_used_at_frac(page,8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!too_full && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arenas_page_try_reabandon_to_mapped(page)) { From 8fc8da5d81bcee92650752d473603ea42a6fb203 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 4 Feb 2025 17:54:49 -0800 Subject: [PATCH 219/264] use thread local stats for abandoned statistics to reduce contention --- include/mimalloc/internal.h | 3 ++- include/mimalloc/types.h | 13 ++++++++++--- src/arena.c | 25 +++++++++++++------------ src/init.c | 12 ++++++++++++ src/page.c | 2 +- src/stats.c | 6 ++++++ 6 files changed, 44 insertions(+), 17 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c1e55ddc..92f02788 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -116,6 +116,7 @@ mi_subproc_t* _mi_subproc_main(void); mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; size_t _mi_thread_seq_id(void) mi_attr_noexcept; +mi_tld_t* _mi_thread_tld(void) mi_attr_noexcept; void _mi_heap_guarded_init(mi_heap_t* heap); // os.c @@ -171,7 +172,7 @@ void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld); mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); void _mi_arenas_page_free(mi_page_t* page); -void _mi_arenas_page_abandon(mi_page_t* page); +void _mi_arenas_page_abandon(mi_page_t* page, mi_tld_t* tld); void _mi_arenas_page_unabandon(mi_page_t* page); bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 0bf5722b..6ed17f09 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -544,13 +544,20 @@ void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount); #define mi_subproc_stat_adjust_increase(subproc,stat,amnt,b) __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amnt, b) #define mi_subproc_stat_adjust_decrease(subproc,stat,amnt,b) __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amnt, b) +#define mi_tld_stat_counter_increase(tld,stat,amount) __mi_stat_counter_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_increase(tld,stat,amount) __mi_stat_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_decrease(tld,stat,amount) __mi_stat_decrease( &(tld)->stats.stat, amount) +#define mi_tld_stat_adjust_increase(tld,stat,amnt,b) __mi_stat_adjust_increase( &(tld)->stats.stat, amnt, b) +#define mi_tld_stat_adjust_decrease(tld,stat,amnt,b) __mi_stat_adjust_decrease( &(tld)->stats.stat, amnt, b) + + #define 
mi_os_stat_counter_increase(stat,amount) mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount) #define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount) #define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount) -#define mi_heap_stat_counter_increase(heap,stat,amount) __mi_stat_counter_increase( &(heap)->tld->stats.stat, amount) -#define mi_heap_stat_increase(heap,stat,amount) __mi_stat_increase( &(heap)->tld->stats.stat, amount) -#define mi_heap_stat_decrease(heap,stat,amount) __mi_stat_decrease( &(heap)->tld->stats.stat, amount) +#define mi_heap_stat_counter_increase(heap,stat,amount) mi_tld_stat_counter_increase(heap->tld, stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) mi_tld_stat_increase( heap->tld, stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) mi_tld_stat_decrease( heap->tld, stat, amount) #define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_stat_counter_increase( (heap)->tld->stats.stat, amount) #define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_stat_increase( (heap)->tld->stats.stat, amount) diff --git a/src/arena.c b/src/arena.c index e111a417..ca2ea164 100644 --- a/src/arena.c +++ b/src/arena.c @@ -563,8 +563,9 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_arena_has_page(arena,page)); mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - mi_subproc_stat_decrease( arena->subproc, pages_abandoned, 1); - mi_subproc_stat_counter_increase(arena->subproc, pages_reclaim_on_alloc, 1); + mi_tld_t* tld = _mi_thread_tld(); + mi_tld_stat_decrease( tld, pages_abandoned, 1); + mi_tld_stat_counter_increase( tld, pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); @@ -855,7 +856,7 @@ void _mi_arenas_page_free(mi_page_t* page) { Arena abandon ----------------------------------------------------------- */ -void _mi_arenas_page_abandon(mi_page_t* page) { +void _mi_arenas_page_abandon(mi_page_t* page, mi_tld_t* tld) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -878,7 +879,7 @@ void _mi_arenas_page_abandon(mi_page_t* page) { const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); MI_UNUSED(wasclear); mi_assert_internal(wasclear); mi_atomic_increment_relaxed(&arena->subproc->abandoned_count[bin]); - mi_subproc_stat_increase(arena->subproc, pages_abandoned, 1); + mi_tld_stat_increase(tld, pages_abandoned, 1); } else { // page is full (or a singleton), or the page is OS/externally allocated @@ -894,7 +895,7 @@ void _mi_arenas_page_abandon(mi_page_t* page) { subproc->os_abandoned_pages = page; } } - mi_subproc_stat_increase(_mi_subproc(), pages_abandoned, 1); + mi_tld_stat_increase(tld, pages_abandoned, 1); } _mi_page_unown(page); } @@ -912,10 +913,10 @@ bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page) { return false; } else { - mi_subproc_t* subproc = _mi_subproc(); - mi_subproc_stat_counter_increase( subproc, pages_reabandon_full, 1); - mi_subproc_stat_adjust_decrease( subproc, pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh - _mi_arenas_page_abandon(page); + mi_tld_t* tld = _mi_thread_tld(); + mi_tld_stat_counter_increase( tld, pages_reabandon_full, 1); + 
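    // note: the `mi_tld_stat_*` macros update thread-local statistics (merged later via `mi_stats_add`),
    // which avoids contended atomic updates on the shared subproc counters.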
mi_tld_stat_adjust_decrease( tld, pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh + _mi_arenas_page_abandon(page,tld); return true; } } @@ -942,14 +943,14 @@ void _mi_arenas_page_unabandon(mi_page_t* page) { mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&arena->subproc->abandoned_count[bin]); - mi_subproc_stat_decrease(arena->subproc, pages_abandoned, 1); + mi_tld_stat_decrease(_mi_thread_tld(), pages_abandoned, 1); } else { // page is full (or a singleton), page is OS allocated - mi_subproc_t* subproc = _mi_subproc(); - mi_subproc_stat_decrease(_mi_subproc(), pages_abandoned, 1); + mi_tld_stat_decrease(_mi_thread_tld(), pages_abandoned, 1); // if not an arena page, remove from the subproc os pages list if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_subproc_t* subproc = _mi_subproc(); mi_lock(&subproc->os_abandoned_pages_lock) { if (page->prev != NULL) { page->prev->next = page->next; } if (page->next != NULL) { page->next->prev = page->prev; } diff --git a/src/init.c b/src/init.c index 33c9794d..ced30104 100644 --- a/src/init.c +++ b/src/init.c @@ -357,6 +357,18 @@ mi_subproc_t* _mi_subproc(void) { } +mi_tld_t* _mi_thread_tld(void) mi_attr_noexcept { + // should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()` + mi_heap_t* heap = mi_prim_get_default_heap(); + if (heap == NULL) { + return &tld_empty; + } + else { + return heap->tld; + } +} + + /* ----------------------------------------------------------- Sub process ----------------------------------------------------------- */ diff --git a/src/page.c b/src/page.c index ccb4445b..dc3a6365 100644 --- a/src/page.c +++ b/src/page.c @@ -280,7 +280,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { mi_page_queue_remove(pq, page); mi_tld_t* tld = page->heap->tld; mi_page_set_heap(page, NULL); - _mi_arenas_page_abandon(page); + _mi_arenas_page_abandon(page,tld); _mi_arenas_collect(false, false, tld); // allow purging } } diff --git a/src/stats.c b/src/stats.c index 057dc093..d8450a84 100644 --- a/src/stats.c +++ b/src/stats.c @@ -152,6 +152,12 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1); + + mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1); + mi_stat_counter_add(&stats->pages_reclaim_on_alloc, &src->pages_reclaim_on_alloc, 1); + mi_stat_counter_add(&stats->pages_reclaim_on_free, &src->pages_reclaim_on_free, 1); + mi_stat_counter_add(&stats->pages_reabandon_full, &src->pages_reabandon_full, 1); + mi_stat_counter_add(&stats->pages_unabandon_busy_wait, &src->pages_unabandon_busy_wait, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) { From df172843d13e357f58ea0e2bf9a9c5b5f54ad070 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 4 Feb 2025 20:15:38 -0800 Subject: [PATCH 220/264] call page_free_collect less often from a page search --- src/page.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/page.c b/src/page.c index dc3a6365..4b0c810c 100644 --- a/src/page.c +++ b/src/page.c @@ -175,7 +175,7 @@ static void 
mi_page_thread_free_collect(mi_page_t* page) mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); do { head = mi_tf_block(tfree); - if (head == NULL) return; // return if the list is empty + if mi_likely(head == NULL) return; // return if the list is empty tfreex = mi_tf_create(NULL,mi_tf_is_owned(tfree)); // set the thread free list to NULL } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough? mi_assert_internal(head != NULL);
@@ -717,14 +717,16 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m count++; #endif candidate_limit--; - - // collect freed blocks by us and other threads - _mi_page_free_collect(page, false); - + // search up to N pages for a best candidate // is the local free list non-empty? - const bool immediate_available = mi_page_immediate_available(page); + bool immediate_available = mi_page_immediate_available(page); + if (!immediate_available) { + // collect freed blocks by us and other threads so we get a proper use count + _mi_page_free_collect(page, false); + immediate_available = mi_page_immediate_available(page); + } // if the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often.
@@ -742,7 +744,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page_candidate = page; candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates); } - else if (mi_page_all_free(page_candidate)) { + else if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); page_candidate = page; }
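As a stand-alone sketch of the detach-then-walk pattern used by `mi_page_thread_free_collect` above (plain C11 atomics, a simplified block type, and without mimalloc's owned-bit packing in `mi_thread_free_t`; illustrative only, not code from the patch series):

#include <stdatomic.h>
#include <stddef.h>

typedef struct block_s { struct block_s* next; } block_t;

// Atomically take the whole cross-thread free list; returns its head, or NULL if empty.
static block_t* detach_thread_free(_Atomic(block_t*)* thread_free) {
  block_t* head = atomic_load_explicit(thread_free, memory_order_relaxed);
  do {
    if (head == NULL) return NULL;            // nothing to collect
  } while (!atomic_compare_exchange_weak_explicit(thread_free, &head, NULL,
                                                  memory_order_acq_rel, memory_order_acquire));
  return head;                                // the caller now owns the detached list
}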
From 27895ce35df45276b7fbb54cb9e800df6065ddd5 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Feb 2025 14:25:36 -0800 Subject: [PATCH 222/264] fix guard page size calculation in secure mode --- src/arena.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/src/arena.c b/src/arena.c index ca2ea164..3e2fc583 100644 --- a/src/arena.c +++ b/src/arena.c
@@ -160,12 +160,15 @@ static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* slice_index, size_t* s return mi_arena_from_memid(page->memid, slice_index, slice_count); } -static size_t mi_memid_size(mi_memid_t memid) { - if (memid.memkind == MI_MEM_ARENA) { - return memid.mem.arena.slice_count * MI_ARENA_SLICE_SIZE; +static size_t mi_page_full_size(mi_page_t* page) { + if (page->memid.memkind == MI_MEM_ARENA) { + return page->memid.mem.arena.slice_count * MI_ARENA_SLICE_SIZE; } - else if (mi_memid_is_os(memid) || memid.memkind == MI_MEM_EXTERNAL) { - return memid.mem.os.size; + else if (mi_memid_is_os(page->memid) || page->memid.memkind == MI_MEM_EXTERNAL) { + mi_assert_internal((uint8_t*)page->memid.mem.os.base <= (uint8_t*)page); + const ptrdiff_t presize = (uint8_t*)page - (uint8_t*)page->memid.mem.os.base; + mi_assert_internal((ptrdiff_t)page->memid.mem.os.size >= presize); + return (presize > page->memid.mem.os.size ? 0 : page->memid.mem.os.size - presize); } else { return 0;
@@ -820,7 +823,7 @@ void _mi_arenas_page_free(mi_page_t* page) { // we must do this since we may later allocate large spans over this page and cannot have a guard page in between #if MI_SECURE >= 2 if (!page->memid.is_pinned) { - _mi_os_secure_guard_page_reset_before((uint8_t*)page + mi_memid_size(page->memid)); + _mi_os_secure_guard_page_reset_before((uint8_t*)page + mi_page_full_size(page)); } #endif
@@ -831,7 +834,7 @@ void _mi_arenas_page_free(mi_page_t* page) { mi_bitmap_clear(arena->pages, page->memid.mem.arena.slice_index); if (page->slice_committed > 0) { // if committed on-demand, set the commit bits to account commit properly - mi_assert_internal(mi_memid_size(page->memid) >= page->slice_committed); + mi_assert_internal(mi_page_full_size(page) >= page->slice_committed); const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE; // conservative //mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices);
@@ -849,7 +852,7 @@ void _mi_arenas_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, page->memid.mem.arena.slice_index, page->memid.mem.arena.slice_count)); } } - _mi_arenas_free(page, mi_memid_size(page->memid), page->memid); + _mi_arenas_free(page, mi_page_full_size(page), page->memid); } /* -----------------------------------------------------------
From 5fbba3f20c0b28bd477cf359df9bdd6c8143e1ce Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Feb 2025 14:27:36 -0800 Subject: [PATCH 223/264] fix sign of comparison --- src/arena.c | 2 +- src/options.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 3e2fc583..78b13749 100644 --- a/src/arena.c
+++ b/src/arena.c @@ -168,7 +168,7 @@ static size_t mi_page_full_size(mi_page_t* page) { mi_assert_internal((uint8_t*)page->memid.mem.os.base <= (uint8_t*)page); const ptrdiff_t presize = (uint8_t*)page - (uint8_t*)page->memid.mem.os.base; mi_assert_internal((ptrdiff_t)page->memid.mem.os.size >= presize); - return (presize > page->memid.mem.os.size ? 0 : page->memid.mem.os.size - presize); + return (presize > (ptrdiff_t)page->memid.mem.os.size ? 0 : page->memid.mem.os.size - presize); } else { return 0; diff --git a/src/options.c b/src/options.c index 7b643092..9ebb0b6a 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 1000,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -168,7 +168,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_GUARDED_SAMPLE_RATE, UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, - { 1, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(page_full_retain) }, { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, From 5aa679cdee122f59a6ceac9aae8cbd4181379ef1 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Feb 2025 15:41:37 -0800 Subject: [PATCH 224/264] make page_reclaim_on_free 0 by default; but allow reclaim_on_free if the page was originally in this heap (just as in v2 with the full queue) --- src/free.c | 47 ++++++++++++++++++++++------------------------- src/heap.c | 2 +- src/init.c | 2 +- src/options.c | 10 +++++----- src/page.c | 13 +++++++------ 5 files changed, 36 insertions(+), 38 deletions(-) diff --git a/src/free.c b/src/free.c index 5e83ad95..b1827f1e 100644 --- a/src/free.c +++ b/src/free.c @@ -217,43 +217,40 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* return; } - const bool too_full = mi_page_is_used_at_frac(page, 8); // more than 7/8th of the page is in use? - // 2. if the page is not too full, we can try to reclaim it for ourselves - // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. - if (!too_full && - _mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && - page->block_size <= MI_SMALL_MAX_OBJ_SIZE // only for small sized blocks - ) + // note: + // we only reclaim if the page originated from our heap (the heap field is preserved on abandonment) + // to avoid claiming arbitrary object sizes and limit indefinite expansion. 
+ // this helps benchmarks like `larson` + const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free); + if (reclaim_on_free >= 0 && page->block_size <= MI_SMALL_MAX_OBJ_SIZE) // only for small sized blocks { // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again // todo: optimize this check further? // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - if (mi_heap_is_initialized(heap)) // we did not already terminate our thread - { - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (tagheap->allow_page_reclaim) && // and we are allowed to reclaim abandoned pages - // (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for a block_size we don't use - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arenas_page_unabandon(page); - _mi_heap_page_reclaim(tagheap, page); - mi_heap_stat_counter_increase(tagheap, pages_reclaim_on_free, 1); - return; - } + mi_heap_t* heap = mi_prim_get_default_heap(); + if (heap != page->heap) { + if (mi_heap_is_initialized(heap)) { + heap = _mi_heap_by_tag(heap, page->heap_tag); } } + if (heap != NULL && heap->allow_page_reclaim && + (heap == page->heap || (reclaim_on_free == 1 && !mi_page_is_used_at_frac(page, 8))) && // only reclaim if we were the originating heap, or if reclaim_on_free == 1 and the pages is not too full + _mi_arena_memid_is_suitable(page->memid,heap->exclusive_arena) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + ) + { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arenas_page_unabandon(page); + _mi_heap_page_reclaim(heap, page); + mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); + return; + } } // 3. 
if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!too_full && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!mi_page_is_used_at_frac(page, 8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arenas_page_try_reabandon_to_mapped(page)) { diff --git a/src/heap.c b/src/heap.c index 10c65ff2..5ac79996 100644 --- a/src/heap.c +++ b/src/heap.c @@ -175,7 +175,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, heap->memid = memid; heap->tld = tld; // avoid reading the thread-local tld during initialization heap->exclusive_arena = _mi_arena_from_id(arena_id); - heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_page_reclaim_on_free)); + heap->allow_page_reclaim = (!allow_destroy && mi_option_get(mi_option_page_reclaim_on_free) >= 0); heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0); heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; diff --git a/src/init.c b/src/init.c index ced30104..d5bfe935 100644 --- a/src/init.c +++ b/src/init.c @@ -259,7 +259,7 @@ static void mi_heap_main_init(void) { //heap_main.keys[0] = _mi_heap_random_next(&heap_main); //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); - heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_page_reclaim_on_free); + heap_main.allow_page_reclaim = (mi_option_get(mi_option_page_reclaim_on_free) >= 0); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); heap_main.page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } diff --git a/src/options.c b/src/options.c index 9ebb0b6a..9caffbd3 100644 --- a/src/options.c +++ b/src/options.c @@ -168,13 +168,13 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_GUARDED_SAMPLE_RATE, UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, - { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free - { 2, UNINIT, MI_OPTION(page_full_retain) }, - { 4, UNINIT, MI_OPTION(page_max_candidates) }, - { 0, UNINIT, MI_OPTION(max_vabits) }, + { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free: -1 = disable completely, 0 = only reclaim into the originating heap, 1 = reclaim on free across heaps + { 2, UNINIT, MI_OPTION(page_full_retain) }, // number of (small) pages to retain in the free page queues + { 4, UNINIT, MI_OPTION(page_max_candidates) }, // max search to find a best page candidate + { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? 
- { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, + { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index 4b0c810c..2a51bea6 100644 --- a/src/page.c +++ b/src/page.c @@ -278,10 +278,11 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { } else { mi_page_queue_remove(pq, page); - mi_tld_t* tld = page->heap->tld; - mi_page_set_heap(page, NULL); - _mi_arenas_page_abandon(page,tld); - _mi_arenas_collect(false, false, tld); // allow purging + mi_heap_t* heap = page->heap; + mi_page_set_heap(page, NULL); + page->heap = heap; // dont set heap to NULL so we can reclaim_on_free within the same heap + _mi_arenas_page_abandon(page, heap->tld); + _mi_arenas_collect(false, false, heap->tld); // allow purging } } @@ -717,7 +718,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m count++; #endif candidate_limit--; - + // search up to N pages for a best candidate // is the local free list non-empty? @@ -744,7 +745,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page_candidate = page; candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates); } - else if (mi_page_all_free(page_candidate)) { + else if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); page_candidate = page; } From 1657bfb453cc3c08dbe612e3499fb01f1e6d97c6 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Feb 2025 16:01:45 -0800 Subject: [PATCH 225/264] clarify control flow and comments in page reclaim_on_free --- src/free.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/free.c b/src/free.c index b1827f1e..c584e150 100644 --- a/src/free.c +++ b/src/free.c @@ -217,17 +217,13 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* return; } - // 2. if the page is not too full, we can try to reclaim it for ourselves - // note: - // we only reclaim if the page originated from our heap (the heap field is preserved on abandonment) - // to avoid claiming arbitrary object sizes and limit indefinite expansion. - // this helps benchmarks like `larson` + // 2. we can try to reclaim the page for ourselves + // note: we only reclaim if the page originated from our heap (the heap field is preserved on abandonment) + // to avoid claiming arbitrary object sizes and limit indefinite expansion. This helps benchmarks like `larson` const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free); if (reclaim_on_free >= 0 && page->block_size <= MI_SMALL_MAX_OBJ_SIZE) // only for small sized blocks { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? + // get our heap (with the right tag) // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) mi_heap_t* heap = mi_prim_get_default_heap(); @@ -236,16 +232,20 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* heap = _mi_heap_by_tag(heap, page->heap_tag); } } - if (heap != NULL && heap->allow_page_reclaim && - (heap == page->heap || (reclaim_on_free == 1 && !mi_page_is_used_at_frac(page, 8))) && // only reclaim if we were the originating heap, or if reclaim_on_free == 1 and the pages is not too full - _mi_arena_memid_is_suitable(page->memid,heap->exclusive_arena) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arenas_page_unabandon(page); - _mi_heap_page_reclaim(heap, page); - mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); - return; + // can we reclaim? + if (heap != NULL && heap->allow_page_reclaim) { + if (heap == page->heap || // only reclaim if we were the originating heap, + (reclaim_on_free == 1 && // OR if the reclaim option across heaps is enabled + !mi_page_is_used_at_frac(page, 8) && // and the page is not too full + _mi_arena_memid_is_suitable(page->memid, heap->exclusive_arena)) // and the memory is suitable + ) + { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arenas_page_unabandon(page); + _mi_heap_page_reclaim(heap, page); + mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); + return; + } } } From 515047b676c43b0de8a7b547716500aeea69793a Mon Sep 17 00:00:00 2001 From: Daan Date: Wed, 5 Feb 2025 20:55:21 -0800 Subject: [PATCH 226/264] improve free on macos --- include/mimalloc/internal.h | 4 ++-- src/free.c | 31 ++++++++++++++++++++----------- src/page-map.c | 8 ++++---- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 92f02788..25e30f10 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -492,7 +492,7 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { // 2-level page map: // double indirection, but low commit and low virtual reserve. // -// the page-map is usually 4 MiB and points to sub maps of 64 KiB. +// the page-map is usually 4 MiB (for 48 bits virtual addresses) and points to sub maps of 64 KiB. // the page-map is committed on-demand (in 64 KiB parts) (and sub-maps are committed on-demand as well) // one sub page-map = 64 KiB => covers 2^(16-3) * 2^16 = 2^29 = 512 MiB address space // the page-map needs 48-(16+13) = 19 bits => 2^19 sub map pointers = 4 MiB size. 
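Spelled out, the size arithmetic in the page-map comment above works as follows (a sketch with made-up macro names, not mimalloc's actual constants):

#define VA_BITS       48                     // assumed user virtual address bits
#define SLICE_SHIFT   16                     // one page-map entry covers a 64 KiB slice
#define SUB_MAP_SIZE  (1UL << 16)            // one sub map is 64 KiB
#define SUB_ENTRIES   (SUB_MAP_SIZE >> 3)    // 8-byte mi_page_t* entries => 2^13 per sub map
// one sub map covers 2^13 * 2^16 = 2^29 bytes = 512 MiB of address space
#define TOP_ENTRIES   (1UL << (VA_BITS - (SLICE_SHIFT + 13)))   // 2^19 sub-map pointers
// top-level size = 2^19 pointers * 8 bytes = 4 MiB, matching the sizes quoted above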
@@ -519,7 +519,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { size_t sub_idx; const size_t idx = _mi_page_map_index(p, &sub_idx); mi_page_t** const sub = _mi_page_map[idx]; - if mi_unlikely(sub == NULL) return NULL; + if mi_unlikely(sub == NULL) return (mi_page_t*)&_mi_page_empty; return sub[sub_idx]; } diff --git a/src/free.c b/src/free.c index c584e150..266faad8 100644 --- a/src/free.c +++ b/src/free.c @@ -123,6 +123,10 @@ static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_ // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept { if (p==NULL) return; // a NULL pointer is seen as abandoned (tid==0) with a full flag set + #if !MI_PAGE_MAP_FLAT + if (page==&_mi_page_empty) return; // an invalid pointer may lead to using the empty page + #endif + mi_assert_internal(p!=NULL && page != NULL && page != &_mi_page_empty); mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_block_check_unguard(page, block, p); mi_free_block_mt(page, block); @@ -135,10 +139,9 @@ void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) } -// Get the segment data belonging to a pointer -// This is just a single `and` in release mode but does further checks in debug mode -// (and secure mode) to see if this was a valid pointer. -static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) +// Get the page belonging to a pointer +// Does further checks in debug mode to see if this was a valid pointer. +static inline mi_page_t* mi_validate_ptr_page(const void* p, const char* msg) { MI_UNUSED_RELEASE(msg); #if MI_DEBUG @@ -146,9 +149,14 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); return NULL; } - mi_page_t* const page = _mi_safe_ptr_page(p); - if (page == NULL && p != NULL) { - _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); + mi_page_t* page = _mi_safe_ptr_page(p); + if (page == NULL) { + if (p != NULL) { + _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); + } + #if !MI_PAGE_MAP_FLAT + page = (mi_page_t*)&_mi_page_empty; + #endif } return page; #else @@ -160,12 +168,13 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { - mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); + mi_page_t* const page = mi_validate_ptr_page(p,"mi_free"); - #if MI_PAGE_MAP_FLAT // if not flat, NULL will point to `_mi_page_empty` and get to `mi_free_generic_mt` + #if MI_PAGE_MAP_FLAT // if not flat, p==NULL leads to `_mi_page_empty` which leads to `mi_free_generic_mt` if mi_unlikely(page==NULL) return; #endif - + mi_assert_internal(page!=NULL); + const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page)); if mi_likely(xtid == 0) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) == 0` // thread-local, aligned, and not a full page @@ -283,7 +292,7 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p } static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - const mi_page_t* const page = mi_checked_ptr_page(p,msg); + const mi_page_t* const page = mi_validate_ptr_page(p,msg); if 
mi_unlikely(page==NULL) return 0; if mi_likely(!mi_page_has_aligned(page)) { const mi_block_t* block = (const mi_block_t*)p; diff --git a/src/page-map.c b/src/page-map.c index 2b610935..74c22e90 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -206,7 +206,7 @@ bool _mi_page_map_init(void) { if (!mi_page_map_memid.initially_committed) { _mi_os_commit(&_mi_page_map[0], os_page_size, NULL); // commit first part of the map } - _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved 2 subs at the end already + _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved 2 sub maps at the end already if (!mi_page_map_memid.initially_committed) { _mi_os_commit(_mi_page_map[0], os_page_size, NULL); // only first OS page } @@ -315,10 +315,10 @@ void _mi_page_map_unregister_range(void* start, size_t size) { mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not already committed? } -// Return the empty page for the NULL pointer to match the behaviour of `_mi_ptr_page` +// Return NULL for invalid pointers mi_page_t* _mi_safe_ptr_page(const void* p) { + if (p==NULL) return NULL; if mi_unlikely(p >= mi_page_map_max_address) return NULL; - if (p == NULL) return (mi_page_t*)&_mi_page_empty; // to match `_mi_ptr_page` (see `mi_free` as well) size_t sub_idx; const size_t idx = _mi_page_map_index(p,&sub_idx); if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL; @@ -328,7 +328,7 @@ mi_page_t* _mi_safe_ptr_page(const void* p) { } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - return (p != NULL && _mi_safe_ptr_page(p) != NULL); + return (_mi_safe_ptr_page(p) != NULL); } #endif From 3d767ebef69a43d5fc3fab8c16b2eaa3395371f2 Mon Sep 17 00:00:00 2001 From: Daan Date: Wed, 5 Feb 2025 21:20:44 -0800 Subject: [PATCH 227/264] use regular free in zone_free on macos --- include/mimalloc.h | 4 ++-- src/options.c | 2 +- src/prim/osx/alloc-override-zone.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 46335619..be28f17a 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -394,8 +394,8 @@ typedef enum mi_option_e { mi_option_guarded_precise, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0) mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) - mi_option_page_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) - mi_option_page_full_retain, // retain N full pages per size class (=2) + mi_option_page_reclaim_on_free, // reclaim abandoned pages on a free (=0). 
-1 disallowr always, 0 allows if the page originated from the current heap, 1 allow always + mi_option_page_full_retain, // retain N full (small) pages per size class (=2) mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4) mi_option_max_vabits, // max user space virtual address bits to consider (=48) mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) diff --git a/src/options.c b/src/options.c index 9caffbd3..485beb48 100644 --- a/src/options.c +++ b/src/options.c @@ -168,7 +168,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_GUARDED_SAMPLE_RATE, UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, - { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free: -1 = disable completely, 0 = only reclaim into the originating heap, 1 = reclaim on free across heaps + { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim abandoned pages on a free: -1 = disable completely, 0 = only reclaim into the originating heap, 1 = reclaim on free across heaps { 2, UNINIT, MI_OPTION(page_full_retain) }, // number of (small) pages to retain in the free page queues { 4, UNINIT, MI_OPTION(page_max_candidates) }, // max search to find a best page candidate { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits diff --git a/src/prim/osx/alloc-override-zone.c b/src/prim/osx/alloc-override-zone.c index d3af170d..a8f5fbc6 100644 --- a/src/prim/osx/alloc-override-zone.c +++ b/src/prim/osx/alloc-override-zone.c @@ -64,7 +64,8 @@ static void* zone_valloc(malloc_zone_t* zone, size_t size) { static void zone_free(malloc_zone_t* zone, void* p) { MI_UNUSED(zone); - mi_cfree(p); + // mi_cfree(p); // checked free as `zone_free` may be called with invalid pointers + mi_free(p); // with the page_map and pagemap_commit=1 we can use the regular free } static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) { From 4c562f392a536fa180e48441a76881e15db6ff13 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 6 Feb 2025 11:53:22 -0800 Subject: [PATCH 228/264] allow page reclaim on free to the originating heap also within a threadpool --- src/free.c | 5 +++-- src/heap.c | 6 +++--- src/page.c | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/free.c b/src/free.c index 266faad8..3fdb35aa 100644 --- a/src/free.c +++ b/src/free.c @@ -220,7 +220,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) - _mi_arenas_page_unabandon(page); + _mi_arenas_page_unabandon(page); // we can free the page directly _mi_arenas_page_free(page); return; @@ -244,8 +244,9 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* // can we reclaim? 
if (heap != NULL && heap->allow_page_reclaim) { if (heap == page->heap || // only reclaim if we were the originating heap, - (reclaim_on_free == 1 && // OR if the reclaim option across heaps is enabled + (reclaim_on_free == 1 && // OR if the reclaim across heaps is allowed !mi_page_is_used_at_frac(page, 8) && // and the page is not too full + !heap->tld->is_in_threadpool && // and not part of a threadpool _mi_arena_memid_is_suitable(page->memid, heap->exclusive_arena)) // and the memory is suitable ) { diff --git a/src/heap.c b/src/heap.c index 5ac79996..daad8afc 100644 --- a/src/heap.c +++ b/src/heap.c @@ -181,10 +181,10 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. - heap->allow_page_reclaim = false; - // .. but abandoning is good in this case: quarter the full page retain (possibly to 0) + // this is checked in `free.c:mi_free_try_collect_mt` + // .. but abandoning is good in this case: halve the full page retain (possibly to 0) // (so blocked threads do not hold on to too much memory) - if (heap->page_full_retain >= 0) { + if (heap->page_full_retain > 0) { heap->page_full_retain = heap->page_full_retain / 4; } } diff --git a/src/page.c b/src/page.c index 2a51bea6..b3dabb41 100644 --- a/src/page.c +++ b/src/page.c @@ -279,7 +279,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { else { mi_page_queue_remove(pq, page); mi_heap_t* heap = page->heap; - mi_page_set_heap(page, NULL); + mi_page_set_heap(page, NULL); page->heap = heap; // dont set heap to NULL so we can reclaim_on_free within the same heap _mi_arenas_page_abandon(page, heap->tld); _mi_arenas_collect(false, false, heap->tld); // allow purging @@ -358,11 +358,11 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { mi_heap_t* heap = mi_page_heap(page); if (heap->allow_page_abandon) { - // abandon full pages + // abandon full pages (this is the usual case in order to allow for sharing of memory between heaps) _mi_page_abandon(page, pq); } else if (!mi_page_is_in_full(page)) { - // put full pages in a heap local queue + // put full pages in a heap local queue (this is for heaps that cannot abandon, for example, if the heap can be destroyed) mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page); _mi_page_free_collect(page, false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set } From 64aaf9d88f507c60ffc9ede4c8aea3b512867456 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Feb 2025 17:08:06 -0800 Subject: [PATCH 229/264] fix performance bug in mi_bchunk_try_find _and_clearNX --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 8a7a9442..d1719c3b 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -804,7 +804,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const size_t post = mi_bfield_clz(~b); if (post > 0) { const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1])); - if (post + pre <= n) { + if (post + pre >= n) { // it fits -- try to claim it atomically const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post); if (mi_bchunk_try_clearNX(chunk, cidx, n, NULL)) { From 7931678899281766f6fb03678928e615bfbcd571 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 6 Feb 2025 22:59:14 -0800 Subject: [PATCH 
230/264] further optimize mi_bchunk_try_find_and_clearNX --- include/mimalloc/bits.h | 8 ++++++-- src/bitmap.c | 14 ++++++++------ src/options.c | 2 +- src/page.c | 4 ++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 64875e9d..d4632441 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -199,6 +199,8 @@ static inline size_t mi_ctz(size_t x) { size_t r; __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return _tzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); @@ -221,6 +223,8 @@ static inline size_t mi_clz(size_t x) { size_t r; __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return _lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS); @@ -254,7 +258,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { bool is_zero; __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); return !is_zero; - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false); #else @@ -271,7 +275,7 @@ static inline bool mi_bsr(size_t x, size_t* idx) { bool is_zero; __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); return !is_zero; - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false); #else diff --git a/src/bitmap.c b/src/bitmap.c index d1719c3b..0b13e2ec 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -773,9 +773,10 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t idx; + // is there a range inside the field? while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - if (idx + n > MI_BFIELD_BITS) break; // too short, maybe cross over, or continue with the next field + if (idx + n > MI_BFIELD_BITS) break; // too short: maybe cross over, or continue with the next field const size_t bmask = mask<>idx == mask); @@ -792,15 +793,16 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, } } else { - // advance - const size_t ones = mi_bfield_ctz(~(b>>idx)); // skip all ones (since it didn't fit the mask) - mi_assert_internal(ones>0); - b = b & ~mi_bfield_mask(ones, idx); // clear the ones + // advance by clearing the least run of ones, for example, with n>=4, idx=2: + // b = 1111 1101 1010 1100 + // .. 
+ (1< 0) { const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1])); diff --git a/src/options.c b/src/options.c index 485beb48..d1bdd716 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index b3dabb41..4e1f683c 100644 --- a/src/page.c +++ b/src/page.c @@ -137,7 +137,7 @@ bool _mi_page_is_valid(mi_page_t* page) { Page collect the `local_free` and `thread_free` lists ----------------------------------------------------------- */ -static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) +static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) { if (head == NULL) return; @@ -167,7 +167,7 @@ static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) } // Collect the local `thread_free` list using an atomic exchange. -static void mi_page_thread_free_collect(mi_page_t* page) +static mi_decl_noinline void mi_page_thread_free_collect(mi_page_t* page) { // atomically capture the thread free list mi_block_t* head; From 9053cf0cd25e7a59750eb974012c0f371ce3e312 Mon Sep 17 00:00:00 2001 From: Sergey Markelov Date: Fri, 7 Feb 2025 12:35:59 -0700 Subject: [PATCH 231/264] prim: fix dev3 UWP build (#1005) --- src/prim/windows/prim.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 0916a7ea..f91925fc 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -127,9 +127,11 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) config->has_partial_free = false; config->has_virtual_reserve = true; // windows version - const DWORD win_version = GetVersion(); - win_major_version = (DWORD)(LOBYTE(LOWORD(win_version))); - win_minor_version = (DWORD)(HIBYTE(LOWORD(win_version))); + OSVERSIONINFOW version{sizeof(version)}; + if (GetVersionExW(&version)) { + win_major_version = version.dwMajorVersion; + win_minor_version = version.dwMinorVersion; + } // get the page size SYSTEM_INFO si; GetSystemInfo(&si); From ca25fb3d17a1326f89a13c4c01d5a6d67b973af2 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 7 Feb 2025 17:38:53 -0800 Subject: [PATCH 232/264] avoid reload on clearing mask --- src/bitmap.c | 55 ++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 0b13e2ec..c096bd4a 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -165,25 +165,31 @@ static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already // Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0 // and false otherwise (leaving the bit field as is). // `all_clear` is set to `true` if the new bfield became zero. 
-static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { +static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) { mi_assert_internal(mask != 0); - mi_bfield_t old = mi_atomic_load_relaxed(b); - do { - if ((old&mask) != mask) { - // the mask bits are no longer set - if (all_clear != NULL) { *all_clear = (old==0); } + mi_assert_internal((expect & mask) == mask); + // try to atomically clear the mask bits + while mi_unlikely(!mi_atomic_cas_strong_acq_rel(b, &expect, expect & ~mask)) { + if ((expect & mask) != mask) { + if (all_clear != NULL) { *all_clear = (expect == 0); } return false; } - } while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits - if (all_clear != NULL) { *all_clear = ((old&~mask) == 0); } + } + if (all_clear != NULL) { *all_clear = ((expect & ~mask) == 0); } return true; } +static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask, bool* all_clear) { + mi_assert_internal(mask != 0); + const mi_bfield_t expect = mi_atomic_load_relaxed(b); + return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear); +} + // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. // `all_clear` is set to true if the new bfield became zero (and false otherwise) -static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { +static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[chunk_idx]); size_t idx; - if (!allow_allset && (~b == 0)) return false; if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], mi_bfield_mask(1,idx), b, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; @@ -565,7 +570,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } @@ -600,7 +605,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. chunk_idx = mi_ctz(mask) / 8; #endif - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } @@ -621,17 +626,13 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((mi_ctz(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
const size_t chunk_idx = mi_ctz(mask) / 8; - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else - // try first to find a field that is not all set (to reduce fragmentation) (not needed for binned bitmaps) - // for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - // if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true; - // } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx)) return true; } return false; #endif @@ -643,9 +644,8 @@ static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, } #if !(MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)) -static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { +static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - if (!allow_all_set && (~b == 0)) return false; // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F @@ -655,7 +655,7 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); mi_assert_internal((idx%8)==0); - if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // unset the byte atomically + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], (mi_bfield_t)0xFF << idx, b, NULL)) { // unset the byte atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; @@ -701,7 +701,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; // } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true; + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true; } return false; #endif @@ -771,7 +771,8 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const mi_bfield_t mask = mi_bfield_mask(n, 0); // for all fields in the chunk for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + mi_bfield_t b0 = mi_atomic_load_relaxed(&chunk->bfields[i]); + mi_bfield_t b = b0; size_t idx; // is there a range inside the field? 
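The point of the new `_of` variant is to reuse a bfield value the caller has already loaded as the CAS expected value, instead of re-loading it inside the helper. The same pattern as a stand-alone sketch (plain C11 atomics on a 64-bit field rather than mimalloc's atomic wrappers and `mi_bfield_t`; illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// Clear `mask` only if all of its bits are currently set; `expect` is the value the
// caller already read from `*field`, so no extra atomic load is needed here.
static bool try_clear_mask_of(_Atomic uint64_t* field, uint64_t mask, uint64_t expect) {
  do {
    if ((expect & mask) != mask) return false;   // some mask bit was already cleared
  } while (!atomic_compare_exchange_weak(field, &expect, expect & ~mask));
  return true;                                   // bits transitioned from `mask` to 0
}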
@@ -781,7 +782,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const size_t bmask = mask<>idx == mask); if ((b&bmask) == bmask) { // found a match with all bits set, try clearing atomically - if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[i], bmask, b0, NULL)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); @@ -789,7 +790,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, } else { // if we failed to atomically commit, reload b and try again from the start - b = mi_atomic_load_acquire(&chunk->bfields[i]); + b = b0 = mi_atomic_load_acquire(&chunk->bfields[i]); } } else { From 9b7914fd3fb165a8caebc3a37179eee2447ecd93 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:35:21 -0800 Subject: [PATCH 233/264] fix bug in mi_page_free_collect_partly where the tail of the free list was kept --- src/page.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/page.c b/src/page.c index 4e1f683c..f25d0d9b 100644 --- a/src/page.c +++ b/src/page.c @@ -137,7 +137,7 @@ bool _mi_page_is_valid(mi_page_t* page) { Page collect the `local_free` and `thread_free` lists ----------------------------------------------------------- */ -static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) +static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) { if (head == NULL) return; @@ -167,7 +167,7 @@ static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi } // Collect the local `thread_free` list using an atomic exchange. -static mi_decl_noinline void mi_page_thread_free_collect(mi_page_t* page) +static void mi_page_thread_free_collect(mi_page_t* page) { // atomically capture the thread free list mi_block_t* head; @@ -215,11 +215,17 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(!force || page->local_free == NULL); } -// collect elements in the thread-free list starting at `head`. +// Collect elements in the thread-free list starting at `head`. This is an optimized +// version of `_mi_page_free_collect` to be used from `free.c:_mi_free_collect_mt` that avoids atomic access to `xthread_free`. +// +// `head` must be in the `xthread_free` list. It will not collect `head` itself +// so the `used` count is not fully updated in general. However, if the `head` is +// the last remaining element, it will be collected and the used count will become `0` (so `mi_page_all_free` becomes true). 
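// (Worked example for the comment above; illustrative only, not a line of the patch:
//  if `xthread_free` is `head -> b1 -> b2`, the call moves `b1` and `b2` to the local
//  free list and leaves `head` pending, so `used` drops by 2 rather than 3; only when
//  `head` is the last remaining block does the final `_mi_page_free_collect` run and
//  `used` reach 0.)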
void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) { if (head == NULL) return; - mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point at it (and we want to avoid atomic ops) + mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point to it (and we want to avoid atomic ops) if (next != NULL) { + mi_block_set_next(page, head, NULL); mi_page_thread_collect_to_local(page, next); if (page->local_free != NULL && page->free == NULL) { page->free = page->local_free; @@ -229,6 +235,8 @@ void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) { } if (page->used == 1) { // all elements are free'd since we skipped the `head` element itself + mi_assert_internal(mi_tf_block(mi_atomic_load_relaxed(&page->xthread_free)) == head); + mi_assert_internal(mi_block_next(page,head) == NULL); _mi_page_free_collect(page, false); // collect the final element } } @@ -816,31 +824,25 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // Find a page with free blocks of `size`. -static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) { +static mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) { // mi_page_queue_t* pq = mi_page_queue(heap, size); mi_assert_internal(!mi_page_queue_is_huge(pq)); // check the first page: we even do this with candidate search or otherwise we re-search every time mi_page_t* page = pq->first; - if (page != NULL) { - #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness + if mi_likely(page != NULL && mi_page_immediate_available(page)) { + #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { mi_page_extend_free(heap, page); mi_assert_internal(mi_page_immediate_available(page)); } - else - #endif - { - _mi_page_free_collect(page,false); - } - - if (mi_page_immediate_available(page)) { - page->retire_expire = 0; - return page; // fast path - } + #endif + page->retire_expire = 0; + return page; // fast path + } + else { + return mi_page_queue_find_free_ex(heap, pq, true); } - - return mi_page_queue_find_free_ex(heap, pq, true); } From bc7fe059a6d87cb01a58c8f604f5b7764813c659 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:35:52 -0800 Subject: [PATCH 234/264] improve performance of mi_free_collect_mt by specializing mi_page_unown --- src/free.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/free.c b/src/free.c index 3fdb35aa..1df10728 100644 --- a/src/free.c +++ b/src/free.c @@ -201,7 +201,7 @@ void mi_free(void* p) mi_attr_noexcept // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ - +static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free); static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept { mi_assert_internal(mi_page_is_owned(page)); @@ -269,7 +269,36 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* // not reclaimed or free'd, unown again - _mi_page_unown(page); + // _mi_page_unown(page); + mi_page_unown_from_free(page, mt_free); +} + + +// release ownership of a page. 
This may free the page if all (other) blocks were concurrently +// freed in the meantime. Returns true if the page was freed. +// This is a specialized version of `mi_page_unown` to (try to) avoid calling `mi_page_free_collect` again. +static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mt_free != NULL); + mi_assert_internal(page->used > 1); + mi_thread_free_t tf_expect = mi_tf_create(mt_free, true); + mi_thread_free_t tf_new = mi_tf_create(mt_free, false); + while mi_unlikely(!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_expect, tf_new)) { + mi_assert_internal(mi_tf_is_owned(tf_expect)); + while (mi_tf_block(tf_expect) != NULL) { + _mi_page_free_collect(page,false); // update used + if (mi_page_all_free(page)) { // it may become free just before unowning it + _mi_arenas_page_unabandon(page); + _mi_arenas_page_free(page); + return true; + } + tf_expect = mi_atomic_load_relaxed(&page->xthread_free); + } + mi_assert_internal(mi_tf_block(tf_expect)==NULL); + tf_new = mi_tf_create(NULL, false); + } + return false; } From 2017181a6913e174f875c85c250dba3144ac9f04 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:36:09 -0800 Subject: [PATCH 235/264] improve performance of clearNX --- src/bitmap.c | 18 ++---------------- src/bitmap.h | 4 ---- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index c096bd4a..623f921d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -167,14 +167,13 @@ static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already // `all_clear` is set to `true` if the new bfield became zero. static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) { mi_assert_internal(mask != 0); - mi_assert_internal((expect & mask) == mask); // try to atomically clear the mask bits - while mi_unlikely(!mi_atomic_cas_strong_acq_rel(b, &expect, expect & ~mask)) { + do { if ((expect & mask) != mask) { if (all_clear != NULL) { *all_clear = (expect == 0); } return false; } - } + } while (!mi_atomic_cas_weak_acq_rel(b, &expect, expect & ~mask)); if (all_clear != NULL) { *all_clear = ((expect & ~mask) == 0); } return true; } @@ -696,10 +695,6 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } } #else - // first skip allset fields to reduce fragmentation (not needed for binned bitmaps) - // for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - // if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; - // } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true; } @@ -892,15 +887,6 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, } -//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { -// if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages -// if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages -// // if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages -// if (n==0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk -// if (n<=MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); -// 
return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); -//} - // ------- mi_bchunk_clear_once_set --------------------------------------- diff --git a/src/bitmap.h b/src/bitmap.h index 9afdffce..b17d83e5 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -271,10 +271,6 @@ void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); -// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's -// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -bool mi_bbitmap_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); - // Is a sequence of n bits already all set/cleared? bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n); From 2048fa2d17684dde6a588a3aa444149b0cb1d842 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:53:00 -0800 Subject: [PATCH 236/264] fix comments --- include/mimalloc/types.h | 4 ++-- src/bitmap.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6ed17f09..29d6fde9 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -99,7 +99,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ENCODE_FREELIST 1 #endif -// Enable large pages for objects between 64KiB and 256KiB. +// Enable large pages for objects between 64KiB and 512KiB. // Disabled by default as for many workloads the block sizes above 64 KiB are quite random which can lead to too many partially used large pages. #ifndef MI_ENABLE_LARGE_PAGES #define MI_ENABLE_LARGE_PAGES 0 @@ -342,7 +342,7 @@ typedef struct mi_page_s { #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB #if MI_ENABLE_LARGE_PAGES #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 256 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 512KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #else #define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB #define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // <= 64 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` diff --git a/src/bitmap.c b/src/bitmap.c index 623f921d..b458d5e8 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -184,7 +184,7 @@ static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_b return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear); } - +/* // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. 
// `all_clear` is set to true if the new bfield became zero (and false otherwise) @@ -203,6 +203,7 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t id const mi_bfield_t mask = ((mi_bfield_t)0xFF)< Date: Sat, 8 Feb 2025 11:51:18 -0800 Subject: [PATCH 237/264] set the option commit_on_demand back to 2 as we only do this for medium/large pages --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index d1bdd716..485beb48 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); From c7f7c23dc15a27abb6a26e78fd7b3c073f43b388 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 8 Feb 2025 12:43:00 -0800 Subject: [PATCH 238/264] make C compatible --- src/prim/windows/prim.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index f91925fc..31ef0e94 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -127,10 +127,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) config->has_partial_free = false; config->has_virtual_reserve = true; // windows version - OSVERSIONINFOW version{sizeof(version)}; + OSVERSIONINFOW version; _mi_memzero_var(version); if (GetVersionExW(&version)) { - win_major_version = version.dwMajorVersion; - win_minor_version = version.dwMinorVersion; + win_major_version = version.dwMajorVersion; + win_minor_version = version.dwMinorVersion; } // get the page size SYSTEM_INFO si; From 9dd753d2c0aee48b38a56d513ae01231ca6901ac Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 13:12:19 -0800 Subject: [PATCH 239/264] add comment --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index b458d5e8..f3030153 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -169,7 +169,7 @@ static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi mi_assert_internal(mask != 0); // try to atomically clear the mask bits do { - if ((expect & mask) != mask) { + if ((expect & mask) != mask) { // are all bits still set? 
if (all_clear != NULL) { *all_clear = (expect == 0); } return false; } From 06ade47b05672ff33481ba4dd3d4b0f6aa7aefc2 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 8 Feb 2025 23:26:45 -0800 Subject: [PATCH 240/264] fix is_huge definition --- include/mimalloc/internal.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 25e30f10..151c81a3 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -644,8 +644,9 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_huge(const mi_page_t* page) { - return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || - (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page)); + return (mi_page_is_singleton(page) && + (page->block_size > MI_LARGE_MAX_OBJ_SIZE || + (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page))); } static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { From fe5258a179bcc25f010e2012df9f7ab3e52cff97 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 8 Feb 2025 23:33:16 -0800 Subject: [PATCH 241/264] change process initialization order (potential fix for issue #1007) --- src/init.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/init.c b/src/init.c index d5bfe935..31b0d271 100644 --- a/src/init.c +++ b/src/init.c @@ -246,8 +246,6 @@ static void mi_tld_main_init(void) { // Initialization of the (statically allocated) main heap, and the main tld and subproc. static void mi_heap_main_init(void) { if (heap_main.cookie == 0) { - mi_subproc_main_init(); - mi_tld_main_init(); // heap heap_main.cookie = 1; #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) @@ -262,6 +260,9 @@ static void mi_heap_main_init(void) { heap_main.allow_page_reclaim = (mi_option_get(mi_option_page_reclaim_on_free) >= 0); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); heap_main.page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); + + mi_subproc_main_init(); + mi_tld_main_init(); } } @@ -666,14 +667,16 @@ void mi_process_init(void) mi_attr_noexcept { if (!mi_atomic_once(&process_init)) return; _mi_process_is_initialized = true; _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id()); - mi_process_setup_auto_thread_done(); - + mi_detect_cpu_features(); - mi_subproc_main_init(); - mi_tld_main_init(); - mi_heap_main_init(); _mi_os_init(); _mi_page_map_init(); + mi_heap_main_init(); + mi_tld_main_init(); + // the following two can potentially allocate (on freeBSD for locks and thread keys) + mi_subproc_main_init(); + mi_process_setup_auto_thread_done(); + #if MI_DEBUG _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif From d8c119cc4fb1e717261a3f3a875ffeddf5528462 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 9 Feb 2025 08:56:22 -0800 Subject: [PATCH 242/264] add mi_decl_maybe_unused; fix compilation with OPT_SIMD (issue #1009) --- ide/vs2022/mimalloc-lib.vcxproj | 8 +++++++ ide/vs2022/mimalloc-override-dll.vcxproj | 8 +++++++ include/mimalloc/internal.h | 27 ++++++++++++++++-------- src/bitmap.c | 19 ++++++----------- 4 files changed, 41 insertions(+), 21 deletions(-) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index b0547769..035adf8d 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -178,6 +178,7 @@ CompileAsCpp false stdcpp20 + 
/Zc:__cplusplus %(AdditionalOptions) @@ -197,6 +198,7 @@ CompileAsCpp false stdcpp20 + /Zc:__cplusplus %(AdditionalOptions) @@ -224,6 +226,7 @@ CompileAsCpp false stdcpp20 + /Zc:__cplusplus %(AdditionalOptions) @@ -251,6 +254,7 @@ CompileAsCpp false stdcpp20 + /Zc:__cplusplus %(AdditionalOptions) @@ -283,6 +287,7 @@ CompileAsCpp true stdcpp20 + /Zc:__cplusplus %(AdditionalOptions) true @@ -312,6 +317,7 @@ true stdcpp20 AdvancedVectorExtensions2 + /Zc:__cplusplus %(AdditionalOptions) true @@ -348,6 +354,7 @@ stdcpp20 CPUExtensionRequirementsARMv81 Sync + /Zc:__cplusplus %(AdditionalOptions) true @@ -384,6 +391,7 @@ stdcpp20 CPUExtensionRequirementsARMv81 Sync + /Zc:__cplusplus %(AdditionalOptions) true diff --git a/ide/vs2022/mimalloc-override-dll.vcxproj b/ide/vs2022/mimalloc-override-dll.vcxproj index be69716f..3c2ef98f 100644 --- a/ide/vs2022/mimalloc-override-dll.vcxproj +++ b/ide/vs2022/mimalloc-override-dll.vcxproj @@ -174,6 +174,7 @@ MultiThreadedDebugDLL false CompileAsCpp + /Zc:__cplusplus %(AdditionalOptions) $(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies) @@ -204,6 +205,7 @@ MultiThreadedDebugDLL false CompileAsCpp + /Zc:__cplusplus %(AdditionalOptions) $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies) @@ -234,6 +236,7 @@ MultiThreadedDebugDLL false CompileAsCpp + /Zc:__cplusplus %(AdditionalOptions) $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64.lib;%(AdditionalDependencies) @@ -264,6 +267,7 @@ MultiThreadedDebugDLL false CompileAsCpp + /Zc:__cplusplus %(AdditionalOptions) $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64ec.lib;%(AdditionalDependencies) @@ -298,6 +302,7 @@ MultiThreadedDLL CompileAsCpp false + /Zc:__cplusplus %(AdditionalOptions) true @@ -332,6 +337,7 @@ MultiThreadedDLL CompileAsCpp false + /Zc:__cplusplus %(AdditionalOptions) true @@ -367,6 +373,7 @@ CompileAsCpp false CPUExtensionRequirementsARMv81 + /Zc:__cplusplus %(AdditionalOptions) true @@ -402,6 +409,7 @@ CompileAsCpp false CPUExtensionRequirementsARMv81 + /Zc:__cplusplus %(AdditionalOptions) true diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 151c81a3..a76f7baf 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -18,11 +18,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "track.h" #include "bits.h" -#if (MI_DEBUG>0) -#define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) -#else -#define mi_trace_message(...) -#endif +#define mi_decl_cache_align mi_decl_align(64) #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) @@ -52,19 +48,32 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_decl_hidden #endif -#define mi_decl_cache_align mi_decl_align(64) +#if (defined(__GNUC__) && (__GNUC__ >= 7)) || defined(__clang__) // includes clang and icc +#define mi_decl_maybe_unused __attribute__((unused)) +#elif __cplusplus >= 201703L // c++17 +#define mi_decl_maybe_unused [[maybe_unused]] +#else +#define mi_decl_maybe_unused +#endif + +#if defined(__cplusplus) +#define mi_decl_externc extern "C" +#else +#define mi_decl_externc +#endif #if defined(__EMSCRIPTEN__) && !defined(__wasi__) #define __wasi__ #endif -#if defined(__cplusplus) -#define mi_decl_externc extern "C" +#if (MI_DEBUG>0) +#define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) #else -#define mi_decl_externc +#define mi_trace_message(...) 
#endif + // "libc.c" #include int _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); diff --git a/src/bitmap.c b/src/bitmap.c index f3030153..6214980b 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -184,26 +184,23 @@ static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_b return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear); } -/* // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. // `all_clear` is set to true if the new bfield became zero (and false otherwise) -static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) { +mi_decl_maybe_unused static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[chunk_idx]); // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = @@ -663,7 +659,6 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c } return false; } -#endif // find least aligned byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. From f3c86bd976c9f2004d86aa3ec6cf889ab71be4d4 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 9 Feb 2025 18:38:15 -0800 Subject: [PATCH 243/264] add simd test in azure pipeline --- azure-pipelines.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5393035e..fc00dc8c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -89,6 +89,11 @@ jobs: CXX: clang++ BuildType: release-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + Release SIMD Clang: + CC: clang + CXX: clang++ + BuildType: release-simd-clang + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON Secure Clang: CC: clang CXX: clang++ @@ -148,6 +153,9 @@ jobs: Release: BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + Release SIMD: + BuildType: release-simd + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON Secure: BuildType: secure cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON From 11f4da7ea53a13b8bd26614ca7e418c22810e73a Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 9 Feb 2025 18:46:55 -0800 Subject: [PATCH 244/264] add simd test on windows --- azure-pipelines.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index fc00dc8c..2ab709ff 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -30,6 +30,10 @@ jobs: BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release MSBuildConfiguration: Release + Release SIMD: + BuildType: release-simd + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON -DMI_WIN_USE_FIXED_TLS=ON + MSBuildConfiguration: Release Secure: BuildType: secure cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON From a1cb38b70ebc1c9517cc003d52910d1de2d8d2b4 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 9 Feb 2025 19:10:33 -0800 Subject: [PATCH 245/264] fix link error in debug mode in test --- test/test-stress.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test-stress.c b/test/test-stress.c index 6fbd8d0e..d3b8bcc5 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -261,7 +261,9 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: 
%3d\n", ITER - (n + 1)); + #ifndef USE_STD_MALLOC mi_debug_show_arenas(); + #endif //mi_collect(true); //mi_debug_show_arenas(); } From 89d629317f986d2ef7605ced9fa5ec011adc1594 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 10 Feb 2025 12:45:38 -0800 Subject: [PATCH 246/264] limit page_reclaim to page queues of less than 4 pages; make page_commit_on_demand 0 by default. --- src/free.c | 10 +++++++++- src/options.c | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/free.c b/src/free.c index 1df10728..9ca71499 100644 --- a/src/free.c +++ b/src/free.c @@ -202,6 +202,14 @@ void mi_free(void* p) mi_attr_noexcept // Multi-threaded Free (`_mt`) // ------------------------------------------------------ static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free); +static bool inline mi_page_queue_len_is_atmost( mi_heap_t* heap, size_t block_size, size_t atmost) { + mi_page_queue_t* const pq = mi_page_queue(heap,block_size); + mi_assert_internal(pq!=NULL); + for(mi_page_t* p = pq->first; p!=NULL; p = p->next, atmost--) { + if (atmost == 0) { return false; } + } + return true; +} static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept { mi_assert_internal(mi_page_is_owned(page)); @@ -243,7 +251,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* } // can we reclaim? if (heap != NULL && heap->allow_page_reclaim) { - if (heap == page->heap || // only reclaim if we were the originating heap, + if ((heap == page->heap && mi_page_queue_len_is_atmost(heap, page->block_size, 4)) || // only reclaim if we were the originating heap, and we have at most N pages already (reclaim_on_free == 1 && // OR if the reclaim across heaps is allowed !mi_page_is_used_at_frac(page, 8) && // and the page is not too full !heap->tld->is_in_threadpool && // and not part of a threadpool diff --git a/src/options.c b/src/options.c index a61c2dc2..d1bdd716 100644 --- a/src/options.c +++ b/src/options.c @@ -172,9 +172,9 @@ static mi_option_desc_t options[_mi_option_last] = { 2, UNINIT, MI_OPTION(page_full_retain) }, // number of (small) pages to retain in the free page queues { 4, UNINIT, MI_OPTION(page_max_candidates) }, // max search to find a best page candidate { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits - { MI_DEFAULT_PAGEMAP_COMMIT, + { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? 
- { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); From c820259f3b886062b29ff607dac00226eb3c93e3 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 10 Feb 2025 21:25:30 -0800 Subject: [PATCH 247/264] fix heap_main declaration --- src/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/init.c b/src/init.c index 2cd77cb2..5bedab85 100644 --- a/src/init.c +++ b/src/init.c @@ -138,7 +138,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { MI_MEMID_STATIC }; -static mi_decl_cache_align mi_heap_t heap_main; +extern mi_decl_hidden mi_decl_cache_align mi_heap_t heap_main; static mi_decl_cache_align mi_tld_t tld_main = { 0, // thread_id @@ -153,7 +153,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { MI_MEMID_STATIC // memid }; -static mi_decl_cache_align mi_heap_t heap_main = { +mi_decl_cache_align mi_heap_t heap_main = { &tld_main, // thread local data NULL, // exclusive arena 0, // initial cookie From 69a5fbb1f3f9fdd9361d8a33677d5573c7db5f72 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 11 Feb 2025 09:32:13 -0800 Subject: [PATCH 248/264] avoid overflow in max address calculation on 32-bit (issue #1010) --- src/page-map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/page-map.c b/src/page-map.c index 74c22e90..44d6de4a 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -40,7 +40,7 @@ bool _mi_page_map_init(void) { } // Allocate the page map and commit bits - mi_page_map_max_address = (void*)(MI_PU(1) << vbits); + mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits)); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_pagemap_commit)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); @@ -183,7 +183,7 @@ bool _mi_page_map_init(void) { // Allocate the page map and commit bits mi_assert(MI_MAX_VABITS >= vbits); - mi_page_map_max_address = (void*)(MI_PU(1) << vbits); + mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits)); const size_t page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT)); mi_assert(page_map_count <= MI_PAGE_MAP_COUNT); const size_t os_page_size = _mi_os_page_size(); From 63b8f8f753dae22b5179d639e78f047da07baed6 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 11 Feb 2025 09:47:03 -0800 Subject: [PATCH 249/264] fix assertion condition --- src/arena-meta.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index ff50ea60..530e42cb 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -25,9 +25,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE #define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN -#define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit) +#define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE >= 4k (even on 32-bit) #define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE -#define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 +#define MI_META_BLOCKS_PER_PAGE (MI_META_PAGE_SIZE / MI_META_BLOCK_SIZE) // 512 #define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE) typedef struct mi_meta_page_s { @@ -150,7 +150,7 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { const size_t block_idx = memid.mem.meta.block_index; mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page; mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage); - mi_assert_internal(block_idx + block_count < MI_META_BLOCKS_PER_PAGE); + mi_assert_internal(block_idx + block_count <= MI_META_BLOCKS_PER_PAGE); mi_assert_internal(mi_bbitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); // we zero on free (and on the initial page allocation) so we don't need a "dirty" map _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); From 44a4c83fbfda403ae25dd436fed4adf3197a62b3 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 11 Feb 2025 13:56:58 -0800 Subject: [PATCH 250/264] maintain count in pagequeue for constant time test in free.c --- include/mimalloc/internal.h | 1 + include/mimalloc/types.h | 1 + src/free.c | 5 ++++- src/heap.c | 4 ++++ src/init.c | 2 +- src/page-queue.c | 37 +++++++++++++++++++++++++++++++++++-- 6 files changed, 46 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c9f69a26..b45f7565 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -213,6 +213,7 @@ void _mi_deferred_free(mi_heap_t* heap, bool force); void _mi_page_free_collect(mi_page_t* page, bool force); void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head); void _mi_page_init(mi_heap_t* heap, mi_page_t* page); +bool _mi_page_queue_is_valid(mi_heap_t* heap, const mi_page_queue_t* pq); size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 5059ecd1..a743546e 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -389,6 +389,7 @@ typedef struct mi_tld_s mi_tld_t; typedef struct mi_page_queue_s { mi_page_t* first; mi_page_t* last; + size_t count; size_t block_size; } mi_page_queue_t; diff --git a/src/free.c b/src/free.c index 9ca71499..418acd02 100644 --- a/src/free.c +++ b/src/free.c @@ -202,13 +202,16 @@ void mi_free(void* p) mi_attr_noexcept // Multi-threaded Free (`_mt`) // ------------------------------------------------------ static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free); -static bool inline mi_page_queue_len_is_atmost( mi_heap_t* heap, size_t block_size, size_t atmost) { +static inline bool mi_page_queue_len_is_atmost( mi_heap_t* heap, size_t block_size, size_t atmost) { mi_page_queue_t* const pq = mi_page_queue(heap,block_size); mi_assert_internal(pq!=NULL); + return (pq->count <= atmost); + /* for(mi_page_t* p = pq->first; p!=NULL; p = p->next, atmost--) { if (atmost == 0) { return false; } } return true; + */ } static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, 
mi_block_t* mt_free) mi_attr_noexcept { diff --git a/src/heap.c b/src/heap.c index daad8afc..116d0589 100644 --- a/src/heap.c +++ b/src/heap.c @@ -63,6 +63,9 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ static bool mi_heap_is_valid(mi_heap_t* heap) { mi_assert_internal(heap!=NULL); mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL); + for (int bin = 0; bin < MI_BIN_COUNT; bin++) { + mi_assert_internal(_mi_page_queue_is_valid(heap, &heap->pages[bin])); + } return true; } #endif @@ -106,6 +109,7 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) { if (heap==NULL || !mi_heap_is_initialized(heap)) return; + mi_assert_expensive(mi_heap_is_valid(heap)); const bool force = (collect >= MI_FORCE); _mi_deferred_free(heap, force); diff --git a/src/init.c b/src/init.c index 5bedab85..4cac1c18 100644 --- a/src/init.c +++ b/src/init.c @@ -50,7 +50,7 @@ const mi_page_t _mi_page_empty = { // Empty page queues for every bin -#define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) } +#define QNULL(sz) { NULL, NULL, 0, (sz)*sizeof(uintptr_t) } #define MI_PAGE_QUEUES_EMPTY \ { QNULL(1), \ QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \ diff --git a/src/page-queue.c b/src/page-queue.c index 9e3aaacc..5365c0b7 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -49,6 +49,10 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { return (pq->block_size > MI_LARGE_MAX_OBJ_SIZE); } +static inline size_t mi_page_queue_count(const mi_page_queue_t* pq) { + return pq->count; +} + /* ----------------------------------------------------------- Bins ----------------------------------------------------------- */ @@ -142,6 +146,25 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* } #endif +bool _mi_page_queue_is_valid(mi_heap_t* heap, const mi_page_queue_t* pq) { + if (pq==NULL) return false; + size_t count = 0; + mi_page_t* prev = NULL; + for (mi_page_t* page = pq->first; page != NULL; page = page->next) { + mi_assert_internal(page->prev == prev); + mi_assert_internal(mi_page_block_size(page) == pq->block_size); + mi_assert_internal(page->heap == heap); + if (page->next == NULL) { + mi_assert_internal(pq->last == page); + } + count++; + prev = page; + } + mi_assert_internal(pq->count == count); + return true; +} + + static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { mi_assert_internal(heap!=NULL); uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? 
MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); @@ -211,6 +234,7 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); + mi_assert_internal(queue->count >= 1); mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); @@ -225,6 +249,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_heap_queue_first_update(heap,queue); } heap->page_count--; + queue->count--; page->next = NULL; page->prev = NULL; mi_page_set_in_full(page,false); @@ -253,6 +278,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ else { queue->first = queue->last = page; } + queue->count++; // update direct mi_heap_queue_first_update(heap, queue); @@ -279,6 +305,7 @@ static void mi_page_queue_push_at_end(mi_heap_t* heap, mi_page_queue_t* queue, m else { queue->first = queue->last = page; } + queue->count++; // update direct if (queue->first == page) { @@ -298,6 +325,7 @@ static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* from, bool enqueue_at_end, mi_page_t* page) { mi_assert_internal(page != NULL); + mi_assert_internal(from->count >= 1); mi_assert_expensive(mi_page_queue_contains(from, page)); mi_assert_expensive(!mi_page_queue_contains(to, page)); const size_t bsize = mi_page_block_size(page); @@ -320,8 +348,10 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* mi_assert_internal(mi_heap_contains_queue(heap, from)); mi_heap_queue_first_update(heap, from); } + from->count--; // insert into `to` + to->count++; if (enqueue_at_end) { // enqueue at the end page->prev = to->last; @@ -378,15 +408,16 @@ static void mi_page_queue_enqueue_from_full(mi_page_queue_t* to, mi_page_queue_t size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) { mi_assert_internal(mi_heap_contains_queue(heap,pq)); mi_assert_internal(pq->block_size == append->block_size); - + if (append->first==NULL) return 0; - + // set append pages to new heap and count size_t count = 0; for (mi_page_t* page = append->first; page != NULL; page = page->next) { mi_page_set_heap(page, heap); count++; } + mi_assert_internal(count == append->count); if (pq->last==NULL) { // take over afresh @@ -403,5 +434,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue append->first->prev = pq->last; pq->last = append->last; } + pq->count += append->count; + return count; } From 0cbdcfac94780061af20b3c39e9f21ab41ddd400 Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 11 Feb 2025 16:07:35 -0800 Subject: [PATCH 251/264] fix signed warning --- src/heap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/heap.c b/src/heap.c index 57bb2f52..ac67698a 100644 --- a/src/heap.c +++ b/src/heap.c @@ -63,7 +63,7 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ static bool mi_heap_is_valid(mi_heap_t* heap) { mi_assert_internal(heap!=NULL); mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL); - for (int bin = 0; bin < MI_BIN_COUNT; bin++) { + for (size_t bin = 0; bin < MI_BIN_COUNT; bin++) { mi_assert_internal(_mi_page_queue_is_valid(heap, &heap->pages[bin])); } 
return true; From cd2763aa3dbea905231798cec23c1ba0eaa1f7f7 Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 11 Feb 2025 16:27:25 -0800 Subject: [PATCH 252/264] fix compile warnings and assertion --- src/page-queue.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/page-queue.c b/src/page-queue.c index 1ffbbf2a..6e8b0853 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -141,12 +141,21 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* #endif bool _mi_page_queue_is_valid(mi_heap_t* heap, const mi_page_queue_t* pq) { + MI_UNUSED_RELEASE(heap); if (pq==NULL) return false; - size_t count = 0; - mi_page_t* prev = NULL; + size_t count = 0; MI_UNUSED_RELEASE(count); + mi_page_t* prev = NULL; MI_UNUSED_RELEASE(prev); for (mi_page_t* page = pq->first; page != NULL; page = page->next) { mi_assert_internal(page->prev == prev); - mi_assert_internal(mi_page_block_size(page) == pq->block_size); + if (mi_page_is_in_full(page)) { + mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 2); + } + else if (mi_page_is_huge(page)) { + mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 1); + } + else { + mi_assert_internal(mi_page_block_size(page) == pq->block_size); + } mi_assert_internal(page->heap == heap); if (page->next == NULL) { mi_assert_internal(pq->last == page); From 2775be9bed98f20a8856bd2dd0b893ea41eacc0f Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 11 Feb 2025 16:28:08 -0800 Subject: [PATCH 253/264] disable page commit_on_demand by default --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index 485beb48..d1bdd716 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? 
- { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); From 62848bd0722121da50ea6181a0c685c45518394f Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 17 Feb 2025 14:19:56 -0800 Subject: [PATCH 254/264] remove -mtune flag for now (issue #1010) --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3bb15e8c..c9217001 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -437,15 +437,15 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999) set(MI_OPT_ARCH_FLAGS "") if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) - list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a;-mtune=native") + list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a") endif() if("x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_x86_64;-march=haswell;-Xarch_x86_64;-mavx2") endif() elseif(MI_ARCH STREQUAL "x64") - set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2;-mtune=native") # fast bit scan (since 2013) + set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2") # fast bit scan (since 2013) elseif(MI_ARCH STREQUAL "arm64") - set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a;-mtune=native") # fast atomics (since 2016) + set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics (since 2016) endif() endif() endif() From f8857a5189bd95d4db427b884c78fab14b3e18fb Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 18 Feb 2025 06:33:55 -0800 Subject: [PATCH 255/264] fix mi_bsr to not use lzcnt directly --- include/mimalloc/bits.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index d4632441..89ec7296 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -231,9 +231,8 @@ static inline size_t mi_clz(size_t x) { #elif mi_has_builtinz(clz) return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS); #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) - if (x==0) return MI_SIZE_BITS; - size_t r; - __asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + size_t r = MI_SIZE_BITS; // bsr leaves destination unmodified if the argument is 0 (see ) + __asm ("bsr\t%1, %0" : "+r"(r) : "r"(x) : "cc"); return (MI_SIZE_BITS - 1 - r); #else #define MI_HAS_FAST_BITSCAN 0 @@ -270,12 +269,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsr(size_t x, size_t* idx) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) - // on x64 the carry flag is set on zero which gives better codegen - bool is_zero; - __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); - return !is_zero; - #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? 
(*idx = (size_t)i, true) : false); #else From dce6ec8b41711621f017b9219bafb601dd04e3aa Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 18 Feb 2025 06:45:12 -0800 Subject: [PATCH 256/264] fix find_highest_bit --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 1c28fe44..3907e91d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -49,7 +49,7 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) { - return mi_bsf(x, idx); + return mi_bsr(x, idx); } From 7e611f7545a70b4db7c561f04688ce483bf6a37b Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 18 Feb 2025 16:04:50 -0800 Subject: [PATCH 257/264] merge from dev --- src/prim/unix/prim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 1c33288e..04d931d7 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -57,7 +57,7 @@ terms of the MIT license. A copy of the license can be found in the file #include #endif -#if defined(__linux__) || defined(__FreeBSD__) +#if (defined(__linux__) && !defined(__ANDROID__)) || defined(__FreeBSD__) #define MI_HAS_SYSCALL_H #include #endif From a7f11cd2b03c45c71f21e7ce493c6e81bb07e644 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 20 Feb 2025 14:39:35 -0800 Subject: [PATCH 258/264] define mi_clz/ctz in portable way on x64 that does not require BMI1 (issue #1016) --- include/mimalloc/bits.h | 43 ++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 89ec7296..9b1d75f7 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -195,19 +195,24 @@ size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); static inline size_t mi_ctz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 - size_t r; - __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + #if defined(__GNUC__) && MI_ARCH_X64 + // tzcnt is interpreted as bsf if BMI1 is not supported (pre-haswell) + // tzcnt sets carry-flag on zero, while bsf sets the zero-flag + // tzcnt sets the result to MI_SIZE_BITS if the argument 0 + // bsf leaves destination _unmodified_ if the argument is 0 (both AMD and Intel now, see ) + // so we always initialize r to MI_SIZE_BITS to work correctly on all cpu's without branching + size_t r = MI_SIZE_BITS; + __asm ("tzcnt\t%1, %0" : "+r"(r) : "r"(x) : "cc"); // use '+r' to keep the assignment to r in case this becomes bsf on older cpu's return r; - #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) - return _tzcnt_u64(x); + #elif mi_has_builtinz(ctz) + return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return (x!=0 ? _tzcnt_u64(x) : MI_SIZE_BITS); // ensure it still works on older cpu's as well #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); - #elif mi_has_builtinz(ctz) - return (x!=0 ? 
(size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); - #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) - size_t r = MI_SIZE_BITS; // bsf leaves destination unmodified if the argument is 0 (see ) + #elif defined(__GNUC__) && MI_ARCH_X86 + size_t r = MI_SIZE_BITS; __asm ("bsf\t%1, %0" : "+r"(r) : "r"(x) : "cc"); return r; #elif MI_HAS_FAST_POPCOUNT @@ -219,20 +224,21 @@ static inline size_t mi_ctz(size_t x) { } static inline size_t mi_clz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 + // we don't optimize to lzcnt as there are still non BMI1 cpu's around (like Intel Celeron, see issue #1016) + // on pre-haswell cpu's lzcnt gets executed as bsr which is not equivalent (at it returns the bit position) + #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 size_t r; __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; - #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) - return _lzcnt_u64(x); + #elif mi_has_builtinz(clz) + return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS); - #elif mi_has_builtinz(clz) - return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS); #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) - size_t r = MI_SIZE_BITS; // bsr leaves destination unmodified if the argument is 0 (see ) - __asm ("bsr\t%1, %0" : "+r"(r) : "r"(x) : "cc"); + if (x==0) return MI_SIZE_BITS; + size_t r; + __asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return (MI_SIZE_BITS - 1 - r); #else #define MI_HAS_FAST_BITSCAN 0 @@ -252,12 +258,13 @@ static inline size_t mi_clz(size_t x) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsf(size_t x, size_t* idx) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) + // see note in `mi_ctz` + #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); return !is_zero; - #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanForward)(&i, x) ? 
(*idx = (size_t)i, true) : false); #else From b59c1f8ce4ba7d88a1782118cfd5ac6d7dbee876 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 24 Feb 2025 13:48:21 -0800 Subject: [PATCH 259/264] update comments --- include/mimalloc/bits.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 9b1d75f7..335fbab7 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -197,9 +197,9 @@ size_t _mi_ctz_generic(size_t x); static inline size_t mi_ctz(size_t x) { #if defined(__GNUC__) && MI_ARCH_X64 // tzcnt is interpreted as bsf if BMI1 is not supported (pre-haswell) - // tzcnt sets carry-flag on zero, while bsf sets the zero-flag - // tzcnt sets the result to MI_SIZE_BITS if the argument 0 - // bsf leaves destination _unmodified_ if the argument is 0 (both AMD and Intel now, see ) + // if the argument is zero: + // - tzcnt: sets carry-flag, and returns MI_SIZE_BITS + // - bsf : sets zero-flag, and leaves the destination _unmodified_ (on both AMD and Intel now, see ) // so we always initialize r to MI_SIZE_BITS to work correctly on all cpu's without branching size_t r = MI_SIZE_BITS; __asm ("tzcnt\t%1, %0" : "+r"(r) : "r"(x) : "cc"); // use '+r' to keep the assignment to r in case this becomes bsf on older cpu's @@ -207,7 +207,7 @@ static inline size_t mi_ctz(size_t x) { #elif mi_has_builtinz(ctz) return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) - return (x!=0 ? _tzcnt_u64(x) : MI_SIZE_BITS); // ensure it still works on older cpu's as well + return (x!=0 ? _tzcnt_u64(x) : MI_SIZE_BITS); // ensure it still works on non-BMI1 cpu's as well #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); @@ -224,7 +224,7 @@ static inline size_t mi_ctz(size_t x) { } static inline size_t mi_clz(size_t x) { - // we don't optimize to lzcnt as there are still non BMI1 cpu's around (like Intel Celeron, see issue #1016) + // we don't optimize anymore to lzcnt as there are still non BMI1 cpu's around (like Intel Celeron, see issue #1016) // on pre-haswell cpu's lzcnt gets executed as bsr which is not equivalent (at it returns the bit position) #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 size_t r; @@ -258,7 +258,7 @@ static inline size_t mi_clz(size_t x) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). 
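// Worked example of the zero-argument behaviour the portable `mi_ctz` earlier in this
// header relies on (a sketch restating the comments in this patch, not an added guarantee):
//
//   size_t r = MI_SIZE_BITS;                            // pre-set, 64 on a 64-bit target
//   __asm ("tzcnt\t%1, %0" : "+r"(r) : "r"(x) : "cc");  // decodes as `bsf` on pre-BMI1 cpu's
//   // x == 8: tzcnt and bsf both write 3 into r
//   // x == 0: tzcnt writes MI_SIZE_BITS into r; bsf leaves r unmodified, so r stays MI_SIZE_BITS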
static inline bool mi_bsf(size_t x, size_t* idx) { - // see note in `mi_ctz` + // we don't optimize anymore to lzcnt so we run correctly on older cpu's as well #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; From c3fc75e0ff27d72880906a2d48d5f21f6a195402 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 28 Feb 2025 16:26:45 -0800 Subject: [PATCH 260/264] update clz/ctz for BMI1 --- include/mimalloc/bits.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 335fbab7..fc56e8ea 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -195,7 +195,11 @@ size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); static inline size_t mi_ctz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + size_t r; + __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_X64 // tzcnt is interpreted as bsf if BMI1 is not supported (pre-haswell) // if the argument is zero: // - tzcnt: sets carry-flag, and returns MI_SIZE_BITS @@ -226,7 +230,7 @@ static inline size_t mi_ctz(size_t x) { static inline size_t mi_clz(size_t x) { // we don't optimize anymore to lzcnt as there are still non BMI1 cpu's around (like Intel Celeron, see issue #1016) // on pre-haswell cpu's lzcnt gets executed as bsr which is not equivalent (at it returns the bit position) - #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 size_t r; __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; @@ -259,7 +263,7 @@ static inline size_t mi_clz(size_t x) { // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsf(size_t x, size_t* idx) { // we don't optimize anymore to lzcnt so we run correctly on older cpu's as well - #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); @@ -276,7 +280,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsr(size_t x, size_t* idx) { - #if 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? 
(*idx = (size_t)i, true) : false); #else From 6fce7b90a477b14605f3e301fe2e2adcc009b6c8 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 28 Feb 2025 16:51:59 -0800 Subject: [PATCH 261/264] reduce object class sizes (/8), add max reclaim queue size --- include/mimalloc.h | 1 + include/mimalloc/types.h | 4 ++-- src/free.c | 51 +++++++++++++++++++++------------------- src/options.c | 3 ++- 4 files changed, 32 insertions(+), 27 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index b14cba52..1a544b6f 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -404,6 +404,7 @@ typedef enum mi_option_e { mi_option_max_vabits, // max user space virtual address bits to consider (=48) mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) mi_option_page_commit_on_demand, // commit page memory on-demand + mi_option_page_reclaim_max, // don't reclaim pages if we already own N pages (in that size class) (=16) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index a743546e..ba3c43fa 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -339,9 +339,9 @@ typedef struct mi_page_s { #endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 16 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < ~8 KiB #if MI_ENABLE_LARGE_PAGES -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB #define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 512KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #else #define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/4) // <= 128 KiB diff --git a/src/free.c b/src/free.c index 418acd02..12bb8e26 100644 --- a/src/free.c +++ b/src/free.c @@ -239,33 +239,36 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* // 2. we can try to reclaim the page for ourselves // note: we only reclaim if the page originated from our heap (the heap field is preserved on abandonment) - // to avoid claiming arbitrary object sizes and limit indefinite expansion. This helps benchmarks like `larson` - const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free); - if (reclaim_on_free >= 0 && page->block_size <= MI_SMALL_MAX_OBJ_SIZE) // only for small sized blocks + // to avoid claiming arbitrary object sizes and limit indefinite expansion. This helps benchmarks like `larson` + if (page->block_size <= MI_SMALL_MAX_OBJ_SIZE) // only for small sized blocks { - // get our heap (with the right tag) - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* heap = mi_prim_get_default_heap(); - if (heap != page->heap) { - if (mi_heap_is_initialized(heap)) { - heap = _mi_heap_by_tag(heap, page->heap_tag); + const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free); + if (reclaim_on_free >= 0) { // and reclaiming is allowed + // get our heap (with the right tag) + // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should + // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) + mi_heap_t* heap = mi_prim_get_default_heap(); + if (heap != page->heap) { + if (mi_heap_is_initialized(heap)) { + heap = _mi_heap_by_tag(heap, page->heap_tag); + } } - } - // can we reclaim? - if (heap != NULL && heap->allow_page_reclaim) { - if ((heap == page->heap && mi_page_queue_len_is_atmost(heap, page->block_size, 4)) || // only reclaim if we were the originating heap, and we have at most N pages already + // can we reclaim into this heap? + if (heap != NULL && heap->allow_page_reclaim) { + const long reclaim_max = _mi_option_get_fast(mi_option_page_reclaim_max); + if ((heap == page->heap && mi_page_queue_len_is_atmost(heap, page->block_size, reclaim_max)) || // only reclaim if we were the originating heap, and we have at most N pages already (reclaim_on_free == 1 && // OR if the reclaim across heaps is allowed - !mi_page_is_used_at_frac(page, 8) && // and the page is not too full - !heap->tld->is_in_threadpool && // and not part of a threadpool - _mi_arena_memid_is_suitable(page->memid, heap->exclusive_arena)) // and the memory is suitable - ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arenas_page_unabandon(page); - _mi_heap_page_reclaim(heap, page); - mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); - return; + !mi_page_is_used_at_frac(page, 8) && // and the page is not too full + !heap->tld->is_in_threadpool && // and not part of a threadpool + _mi_arena_memid_is_suitable(page->memid, heap->exclusive_arena)) // and the memory is suitable + ) + { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arenas_page_unabandon(page); + _mi_heap_page_reclaim(heap, page); + mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); + return; + } } } } diff --git a/src/options.c b/src/options.c index d1bdd716..b8028afe 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,8 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? 
- { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this only on overcommit systems (like Linux)) + { 16, UNINIT, MI_OPTION(page_reclaim_max) }, // don't reclaim pages if we already own N pages (in that size class) }; static void mi_option_init(mi_option_desc_t* desc); From 632fe6d8c8f87c54b0b37869fd3db1d41425b38b Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 3 Mar 2025 17:19:20 -0800 Subject: [PATCH 262/264] add MI_WIN_DBG_EXTS option for cmake --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c47671d..52844552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,8 @@ option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead option(MI_NO_THP "Disable transparent huge pages support on Linux/Android for the mimalloc process only" OFF) option(MI_EXTRA_CPPDEFS "Extra pre-processor definitions (use as `-DMI_EXTRA_CPPDEFS=\"opt1=val1;opt2=val2\"`)" "") +option(MI_WIN_DBG_EXTS "Build with windows debugger extension points") + # negated options for vcpkg features option(MI_NO_USE_CXX "Use plain C compilation (has priority over MI_USE_CXX)" OFF) option(MI_NO_OPT_ARCH "Do not use architecture specific optimizations (like '-march=armv8.1-a' for example) (has priority over MI_OPT_ARCH)" OFF) @@ -512,6 +514,9 @@ endfunction() if(WIN32) list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt) + if(MI_WIN_DBG_EXTS) + list(APPEND mi_libraries dbgeng) # todo: only for the dll? + endif() else() find_link_library("pthread" MI_LIB_PTHREAD) if(MI_LIB_PTHREAD) From a6302f47680193b8c8e3afba571bcea87f0775e4 Mon Sep 17 00:00:00 2001 From: Gustavo Varo Date: Tue, 4 Mar 2025 08:57:24 -0500 Subject: [PATCH 263/264] Add barebones of MiMalloc WinDbg extension --- CMakeLists.txt | 15 ++ ide/vs2022/mimalloc-lib.vcxproj | 5 +- ide/vs2022/mimalloc-lib.vcxproj.filters | 3 + ide/vs2022/mimalloc-override-dll.vcxproj | 4 +- .../mimalloc-override-dll.vcxproj.filters | 3 + src/prim/windows/windbg/mimalloc_dbg.cpp | 146 ++++++++++++++++++ 6 files changed, 173 insertions(+), 3 deletions(-) create mode 100644 src/prim/windows/windbg/mimalloc_dbg.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 52844552..2dba9ba1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,10 @@ set(mi_sources src/stats.c src/prim/prim.c) +if(WIN32 AND MI_WIN_DBG_EXTS) + list(APPEND mi_sources src/prim/windows/windbg/mimalloc_dbg.cpp) +endif() + set(mi_cflags "") set(mi_cflags_static "") # extra flags for a static library build set(mi_cflags_dynamic "") # extra flags for a shared-object library build @@ -255,6 +259,17 @@ if(MI_TRACK_ETW) endif() endif() +if(MI_WIN_DBG_EXTS) + if(NOT WIN32) + set(MI_WIN_DBG_EXTS OFF) + message(WARNING "Can only enable Windows debbuger extension support on Windows (MI_WIN_DBG_EXTS=OFF)") + endif() + if(MI_WIN_DBG_EXTS) + message(STATUS "Compile with Windows debbuger extension support (MI_WIN_DBG_EXTS=ON)") + list(APPEND mi_defines MI_WIN_DBG_EXTS=1) + endif() +endif() + if(MI_GUARDED) message(STATUS "Compile guard pages behind certain object allocations (MI_GUARDED=ON)") list(APPEND mi_defines MI_GUARDED=1) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index b4bf013e..b5bc9677 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -211,8 +211,7 @@ - - + dbgeng.lib @@ -477,6 +476,7 @@ 
true + @@ -493,6 +493,7 @@ + diff --git a/ide/vs2022/mimalloc-lib.vcxproj.filters b/ide/vs2022/mimalloc-lib.vcxproj.filters index 6825f113..c2d7db7b 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj.filters +++ b/ide/vs2022/mimalloc-lib.vcxproj.filters @@ -61,6 +61,9 @@ Sources + + Sources + diff --git a/ide/vs2022/mimalloc-override-dll.vcxproj b/ide/vs2022/mimalloc-override-dll.vcxproj index 556d7926..3904e344 100644 --- a/ide/vs2022/mimalloc-override-dll.vcxproj +++ b/ide/vs2022/mimalloc-override-dll.vcxproj @@ -208,7 +208,7 @@ /Zc:__cplusplus %(AdditionalOptions) - $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies) + $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;dbgeng.lib;%(AdditionalDependencies) @@ -441,6 +441,7 @@ + @@ -506,6 +507,7 @@ true + diff --git a/ide/vs2022/mimalloc-override-dll.vcxproj.filters b/ide/vs2022/mimalloc-override-dll.vcxproj.filters index ebcf545a..e09e6bb1 100644 --- a/ide/vs2022/mimalloc-override-dll.vcxproj.filters +++ b/ide/vs2022/mimalloc-override-dll.vcxproj.filters @@ -61,6 +61,9 @@ Sources + + Sources + diff --git a/src/prim/windows/windbg/mimalloc_dbg.cpp b/src/prim/windows/windbg/mimalloc_dbg.cpp new file mode 100644 index 00000000..5bc76372 --- /dev/null +++ b/src/prim/windows/windbg/mimalloc_dbg.cpp @@ -0,0 +1,146 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) Microsoft Research +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include + +#include "mimalloc.h" +#include "mimalloc/internal.h" + +ULONG64 g_MiMallocBase = 0; +IDebugClient* g_DebugClient = nullptr; +IDebugControl* g_DebugControl = nullptr; +IDebugSymbols3* g_DebugSymbols = nullptr; +IDebugDataSpaces* g_DataSpaces = nullptr; + +// Function to find mimalloc.dll base address at startup +HRESULT FindMimallocBase() +{ + if (g_DebugSymbols == nullptr) + { + return E_FAIL; + } + + return g_DebugSymbols->GetModuleByModuleName("mimalloc", 0, NULL, &g_MiMallocBase); +} + +// Entry point for the extension +extern "C" __declspec(dllexport) HRESULT CALLBACK DebugExtensionInitialize(PULONG version, PULONG flags) +{ + UNREFERENCED_PARAMETER(flags); + + // Ensure Version is valid + if (!version) + { + return E_INVALIDARG; + } + + // Set the version + *version = DEBUG_EXTENSION_VERSION(1, 0); + + HRESULT hr = DebugCreate(__uuidof(IDebugClient), (void**)&g_DebugClient); + if (FAILED(hr)) + { + return hr; + } + + // Query for the IDebugControl interface + hr = g_DebugClient->QueryInterface(__uuidof(IDebugControl), (void**)&g_DebugControl); + if (FAILED(hr)) + { + g_DebugClient->Release(); + + return hr; + } + + hr = g_DebugClient->QueryInterface(__uuidof(IDebugSymbols3), (void**)&g_DebugSymbols); + if (FAILED(hr)) + { + g_DebugControl->Release(); + g_DebugClient->Release(); + + return hr; + } + + hr = g_DebugClient->QueryInterface(__uuidof(IDebugDataSpaces), (void**)&g_DataSpaces); + if (FAILED(hr)) + { + g_DebugSymbols->Release(); + g_DebugControl->Release(); + g_DebugClient->Release(); + + return hr; + } + + // Find mimalloc base address at startup + hr = FindMimallocBase(); + if (FAILED(hr) || g_MiMallocBase == 0) + { + return E_FAIL; // Prevent extension from loading + } + + mi_register_output( + [](const char* msg, void* arg) { + 
g_DebugControl->Output(DEBUG_OUTPUT_ERROR, msg); + g_DebugControl->Output(DEBUG_OUTPUT_ERROR, "\n"); + }, + nullptr); + + g_DebugControl->Output(DEBUG_OUTPUT_NORMAL, "mimalloc.dll base address found: 0x%llx\n", g_MiMallocBase); + + return S_OK; +} + +// Notifies the extension that a debug event has occurred +extern "C" __declspec(dllexport) void CALLBACK DebugExtensionNotify(ULONG notify, ULONG64 argument) +{ + UNREFERENCED_PARAMETER(notify); + UNREFERENCED_PARAMETER(argument); +} + +// Uninitializes the extension +extern "C" __declspec(dllexport) void CALLBACK DebugExtensionUninitialize() +{ + if (g_DebugSymbols) + { + g_DebugSymbols->Release(); + g_DebugSymbols = nullptr; + } + + if (g_DebugControl) + { + g_DebugControl->Release(); + g_DebugControl = nullptr; + } + + if (g_DebugClient) + { + g_DebugClient->Release(); + g_DebugClient = nullptr; + } +} + +// Sample command: !mi_help +extern "C" __declspec(dllexport) HRESULT CALLBACK mi_help(PDEBUG_CLIENT Client, PCSTR args) +{ + UNREFERENCED_PARAMETER(args); + + // Print Help + g_DebugControl->Output(DEBUG_OUTPUT_NORMAL, "Hello from MiMalloc WinDbg Extension!\n"); + + return S_OK; +} + +extern "C" __declspec(dllexport) HRESULT CALLBACK mi_dump_arenas(PDEBUG_CLIENT client, PCSTR args) +{ + mi_debug_show_arenas(); + return S_OK; +} \ No newline at end of file From fcc76cb95cb2d7fbd7644d41dbe0d3d71333c4cb Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 4 Mar 2025 11:37:41 -0800 Subject: [PATCH 264/264] fix options printing when verbose is off --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index bf6cf437..864ef273 100644 --- a/src/options.c +++ b/src/options.c @@ -202,7 +202,7 @@ void _mi_options_init(void) { } } #endif - if (!mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } + if (mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } } #define mi_stringifyx(str) #str // and stringify
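
Note on the new `page_reclaim_max` option introduced in PATCH 261/264: per the `free.c` change, a block freed from another thread only triggers reclaiming the abandoned page back into the originating heap if that heap owns at most `page_reclaim_max` pages in that size class (default 16). Below is a minimal sketch of tuning this at run time; it assumes the public `mi_option_set`/`mi_option_get` API from `mimalloc.h`, and the value 32 is only an illustrative threshold, not a recommended setting. Options of this kind can normally also be set through the matching environment variable (here `MIMALLOC_PAGE_RECLAIM_MAX`).

#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  // Allow free() to reclaim an abandoned page into this heap as long as the
  // heap owns at most 32 pages of that size class (the patch's default is 16).
  // The value 32 is an arbitrary example for illustration.
  mi_option_set(mi_option_page_reclaim_max, 32);

  void* p = mi_malloc(64);   // small object, well within MI_SMALL_MAX_OBJ_SIZE
  mi_free(p);

  printf("page_reclaim_max = %ld\n", mi_option_get(mi_option_page_reclaim_max));
  return 0;
}

With `MIMALLOC_VERBOSE=1` the option table (including the new entry) is printed at startup; that is the code path the final patch (264/264) corrects, so the table is now printed when verbose is enabled rather than when it is disabled.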