Compare commits

...

269 commits

Author SHA1 Message Date
Daan Leijen
e394e340e4 Merge branch 'dev' into dev3 2025-01-03 18:11:11 -08:00
Daan Leijen
e14c8fc795 bump version to 3.0.0 2025-01-03 18:08:34 -08:00
Daan Leijen
07bf4eea26 merge from dev 2025-01-03 18:07:01 -08:00
Daan Leijen
c95d9865a8 merge from dev3-bin 2025-01-03 14:27:18 -08:00
Daan Leijen
03d816d7be Merge branch 'dev3' into dev3-bin 2025-01-03 14:26:44 -08:00
Daan Leijen
6099f76c8c nicer logic in free 2025-01-03 14:26:32 -08:00
daanx
b432f77bfc Merge branch 'dev3' into dev3-bin 2025-01-03 13:50:37 -08:00
daanx
f6c2550eac fix enable large pages 2025-01-03 13:50:31 -08:00
Daan Leijen
b6adbbca0c combine flags and xthread_id 2025-01-03 13:15:46 -08:00
Daan Leijen
3c43225c1f fix initialization warning on gcc 2025-01-03 08:51:02 -08:00
Daan Leijen
281a513642 fix initialization warning on gcc 2025-01-03 08:48:06 -08:00
Daan Leijen
bbd7a492f0 fix signedness warning 2025-01-03 08:46:30 -08:00
Daan Leijen
7e539cc353 Merge branch 'dev3' into dev3-bin 2025-01-03 08:38:45 -08:00
Daan Leijen
2a75500ac2 disable large pages by default 2025-01-03 08:38:36 -08:00
Daan Leijen
4e43ebb496 Merge branch 'dev' into dev3 2025-01-03 08:17:44 -08:00
Daan Leijen
53873df613 Merge branch 'dev3' into dev3-bin 2025-01-02 17:25:49 -08:00
Daan Leijen
211f11218e merge from dev 2025-01-02 17:25:38 -08:00
Daan Leijen
9363900f75 Merge branch 'dev3-bin' of e:\dev\mimalloc3 into dev3-bin 2025-01-02 15:21:43 -08:00
daanx
ab78d57a84 search size bins from small to large 2025-01-02 15:19:08 -08:00
daanx
d25f714ff5 merge from dev3 2025-01-02 15:06:31 -08:00
daanx
d242e86e74 Merge branch 'dev' into dev3 2025-01-02 15:02:57 -08:00
daanx
34e402e128 fix NX test in try_find_and_clearN 2025-01-02 15:00:17 -08:00
daanx
10b40f90fc fix scan of NX 2025-01-02 14:59:42 -08:00
Daan Leijen
44264b3d8b Merge branch 'dev3-bin' of e:\dev\mimalloc3 into dev3-bin 2025-01-02 12:45:38 -08:00
daanx
670ebd0348 merge from dev3; make medium bin larger than other 2025-01-02 12:24:27 -08:00
daanx
5e26ba6fe6 fix debug output 2025-01-02 12:14:12 -08:00
daanx
3933ac9a3f merge from dev3 2025-01-02 11:54:26 -08:00
daanx
c507ee3d96 make bitmap scan cross bfields for NX; disable the use of large object pages 2025-01-02 11:42:28 -08:00
daanx
ff52ea0553 Merge branch 'dev3' into dev3-bin 2024-12-31 15:11:24 -08:00
daanx
0d302cd174 add comments 2024-12-31 15:11:09 -08:00
Daan Leijen
84f2038a2c Merge branch 'dev3' into dev3-bin 2024-12-31 14:28:25 -08:00
Daan Leijen
9665d604d3 merge from dev 2024-12-31 14:28:09 -08:00
Daan
9511d09529 add neon version for chunk all_set 2024-12-26 23:51:37 -08:00
Daan
dddcd5de16 add neon version for chunk_is_clear 2024-12-26 23:49:38 -08:00
Daan
82a8b2445e Merge branch 'dev3' into dev3-bin 2024-12-26 23:12:11 -08:00
Daan
8a4c26377f add neon code for bit clear 2024-12-26 23:12:03 -08:00
Daan
c9ab24899c Merge branch 'dev3' into dev3-bin 2024-12-26 11:19:32 -08:00
Daan
e6d9011b9d Merge branch 'dev' into dev3 2024-12-26 11:19:04 -08:00
daanx
e359e9b12b merge from dev3 2024-12-26 10:43:10 -08:00
daanx
fb704834c4 Merge branch 'dev3' into dev3-bin 2024-12-26 10:42:35 -08:00
daanx
0a7fd7eb6f use fixed tls on windows with static linking 2024-12-26 10:42:24 -08:00
daanx
807b5cd342 Merge branch 'dev3' into dev3-bin 2024-12-26 10:38:02 -08:00
daanx
8b6eb4752b merge from dev, add decl_hidden for better codegen on page_map loading 2024-12-26 10:37:51 -08:00
daanx
f72ac7a5aa add attr_noexept for better codegen on msvc 2024-12-26 10:28:36 -08:00
daanx
4c5bc125ab Merge branch 'dev3' into dev3-bin 2024-12-26 10:25:03 -08:00
daanx
b70fd1093a merge from dev 2024-12-26 10:24:56 -08:00
daanx
2aad74e0c3 Merge branch 'dev3' into dev3-bin 2024-12-26 10:15:38 -08:00
daanx
bec06cfb95 merge from dev 2024-12-26 10:15:08 -08:00
daanx
27e0c467ae fix c++ initializer warning 2024-12-25 14:56:11 -08:00
Daan Leijen
76d50d4566 Merge branch 'dev3' into dev3-bin 2024-12-25 14:41:43 -08:00
Daan Leijen
efe10513ec fix initializer warning on clang-18 2024-12-25 14:40:32 -08:00
daanx
a245135d89 Merge branch 'dev3' into dev3-bin 2024-12-25 14:12:52 -08:00
daanx
5f13941c18 fix constructor re-initialization on subproc_main 2024-12-25 14:12:45 -08:00
daanx
c65d5b878b Merge branch 'dev3' into dev3-bin 2024-12-25 13:30:50 -08:00
daanx
7ae726bb39 small fixes 2024-12-25 13:30:42 -08:00
daanx
b5c4a3c6e7 merge from dev3 2024-12-25 11:47:54 -08:00
daanx
8339cefdeb fix stats for delay purge commit 2024-12-25 11:45:01 -08:00
daanx
15061be4b2 commit page-map within one allocation 2024-12-25 10:50:49 -08:00
daanx
ce7eb4db7a fix page commit-on-demand setting 2024-12-25 10:49:49 -08:00
daanx
5a663da9aa fix build warning 2024-12-24 20:38:36 -08:00
daanx
e64d6fcc47 Merge branch 'dev3' into dev3-bin 2024-12-24 20:23:47 -08:00
daanx
24b8384f80 remove is_expandable requirement on page candidates 2024-12-24 20:23:37 -08:00
daanx
a65742fdf9 merge from dev3 2024-12-24 20:21:56 -08:00
daanx
8259c0eb7c nice colors for heap maps 2024-12-24 20:10:44 -08:00
daanx
50d22cf092 Merge branch 'dev3' into dev3-bin 2024-12-24 17:15:01 -08:00
daanx
4d1d3471cf rename page options 2024-12-24 17:14:53 -08:00
daanx
fe8e52cbcc Merge branch 'dev3' into dev3-bin 2024-12-24 17:07:19 -08:00
daanx
1e1a12bf3c fix rounding issue with huge size allocations 2024-12-24 17:07:11 -08:00
daanx
6f6190c8a9 Merge branch 'dev3' into dev3-bin 2024-12-24 16:40:02 -08:00
daanx
d862e57955 fix huge page allocation size 2024-12-24 16:39:54 -08:00
daanx
e078879825 Merge branch 'dev3' into dev3-bin 2024-12-24 15:00:14 -08:00
daanx
ad6f48f3e4 fix assertion for huge pages 2024-12-24 15:00:05 -08:00
Daan Leijen
431370df62 Merge branch 'dev3' into dev3-bin 2024-12-24 12:10:46 -08:00
Daan Leijen
016b36d917 fix max va bits on unix 2024-12-24 12:10:34 -08:00
Daan Leijen
71a1645d4d fix build 2024-12-24 12:04:21 -08:00
daanx
7c331a967b merge from dev3 2024-12-24 11:42:02 -08:00
daanx
d21114b5f2 improve page commit on demand 2024-12-24 11:37:52 -08:00
daanx
ba68810333 commit page on demand 2024-12-23 18:33:37 -08:00
daanx
9a7c0d443a max obj size 1/8 of a page 2024-12-23 17:15:13 -08:00
daanx
b77b34df96 double arena per 4; large page objects 1/8 of large page size 2024-12-23 17:10:34 -08:00
daanx
3fa3476712 Merge branch 'dev3' into dev3-bin 2024-12-23 16:47:08 -08:00
daanx
9bad269c51 fix purge delay check for arenas 2024-12-23 16:47:01 -08:00
daanx
c65c6d83bd fix guard page size 2024-12-23 16:31:42 -08:00
daanx
b515a0ad4c add _mi_os_guard_page_size 2024-12-23 16:28:34 -08:00
daanx
88d8ee964f remove is_large member (and use is_pinned for this) 2024-12-23 15:04:06 -08:00
daanx
657135de36 commit 2level page-map on over-commit systems 2024-12-23 09:53:52 -08:00
daanx
da2ab86e9f Merge branch 'dev3' into dev3-bin 2024-12-22 22:31:26 -08:00
daanx
bc5ae31649 add abandoned_visit_blocks 2024-12-22 22:31:16 -08:00
daanx
04970f43e5 document way to use a TLS slot on windows 2024-12-22 21:55:40 -08:00
daanx
dd1b37c9f8 fix recursive tls access on macOS <= 14 2024-12-22 21:03:03 -08:00
daanx
8d2b7b0383 merge from dev3 2024-12-22 18:34:39 -08:00
daanx
36bf7dfc45 Merge branch 'dev3' into dev3-bin 2024-12-22 18:33:56 -08:00
daanx
f605cb73e5 old purge delay 2024-12-22 18:33:44 -08:00
daanx
823f5b7ecd merge from dev3 2024-12-22 18:32:47 -08:00
daanx
e61ab67185 cleanup 2024-12-22 18:31:33 -08:00
daanx
1eea4309b6 Merge branch 'dev3' into dev3-bin 2024-12-22 18:09:27 -08:00
daanx
db82baf1a8 cleanup, some renaming 2024-12-22 18:09:16 -08:00
daanx
9ecadaecd5 clean up 2024-12-22 17:55:56 -08:00
daanx
b920fc1b72 merge from dev3 2024-12-22 17:38:48 -08:00
daanx
773fe7ae5b support full secure build 2024-12-22 17:25:58 -08:00
daanx
516e644359 rename option pagemap_commit; always commit the page map on macos (for now) 2024-12-22 16:06:49 -08:00
daanx
6b97830f6a merge from dev3 2024-12-22 14:40:46 -08:00
daanx
c5cfc92f0c small fixes 2024-12-22 14:39:57 -08:00
daanx
a42a2a926b improving level 2 page-map 2024-12-22 14:18:33 -08:00
daanx
3c7d7e1f11 experiment with 2 level pagemap 2024-12-22 14:07:57 -08:00
daanx
8d16303aa6 add -mtune=native with opt arch 2024-12-22 12:21:31 -08:00
daanx
93fa8d895a revert back to flat address map 2024-12-22 12:18:53 -08:00
daanx
c9b2d31665 fix page_map initialization 2024-12-21 23:17:11 -08:00
daanx
56cbddfc7e initial work on a two-level page-map 2024-12-21 23:08:52 -08:00
daanx
1e2221f512 fix signed/unsigned; fix heap_destroy assert failure 2024-12-21 19:28:53 -08:00
daanx
bfc498e54a Merge branch 'dev3' into dev3-bin 2024-12-21 16:25:04 -08:00
daanx
d7d626cbfa enable collecting from the full page queue 2024-12-21 16:24:56 -08:00
daanx
b991510813 merge from dev3 2024-12-21 15:56:22 -08:00
daanx
da17a59bdb re-add deferred free and heap retired collect 2024-12-21 15:53:50 -08:00
daanx
5de5550c63 merge from dev3 2024-12-21 15:52:15 -08:00
daanx
c138fba149 merge from dev 2024-12-21 15:49:17 -08:00
daanx
1a6fbdf0b2 merge from dev 2024-12-21 15:48:49 -08:00
daanx
108c84e858 remove req_arena parameter to arena_reserve 2024-12-21 14:45:14 -08:00
daanx
7d46478a5f add initial load/unload for heaps 2024-12-21 13:19:06 -08:00
daanx
89b0d5a357 allocate heaps associated with an arena in that arena 2024-12-21 11:53:29 -08:00
daanx
4ad7fedd25 track os abandoned pages in a list 2024-12-21 11:35:30 -08:00
daanx
95aeda4cdd merge subproc stats on delete 2024-12-21 10:53:34 -08:00
daanx
dece8a587b make stats part of a subproc 2024-12-21 10:43:08 -08:00
daanx
daac75af36 fix lock recursion 2024-12-20 22:13:58 -08:00
daanx
a5b7d7f264 subprocesses own arena's 2024-12-20 21:38:31 -08:00
daanx
53857ddaa3 Merge branch 'dev' into dev3 2024-12-20 17:32:32 -08:00
daanx
7141d9f164 remove busy wait for arena reservation 2024-12-20 17:31:48 -08:00
daanx
bc459b5e16 Merge branch 'dev3' of https://github.com/microsoft/mimalloc into dev3 2024-12-20 16:46:18 -08:00
Daan Leijen
278f1ff556 merge from dev; match test-stress 2024-12-20 14:00:02 -08:00
daanx
b2d1b4c472 Merge branch 'dev3-bin' of https://github.com/microsoft/mimalloc into dev3-bin 2024-12-20 13:10:55 -08:00
daanx
efa82e1c7d Merge branch 'dev3' of https://github.com/microsoft/mimalloc into dev3 2024-12-20 13:10:16 -08:00
Daan Leijen
f0f4c9c009 Merge branch 'dev3' into dev3-bin 2024-12-20 13:07:00 -08:00
Daan Leijen
7822438561 merge from dev 2024-12-20 13:06:46 -08:00
Daan Leijen
4322546a9b Merge branch 'dev3' into dev3-bin 2024-12-20 13:01:09 -08:00
Daan Leijen
f6408235f7 merge from dev 2024-12-20 13:01:00 -08:00
Daan Leijen
13a58ac343 Merge branch 'dev3' into dev3-bin 2024-12-20 11:56:16 -08:00
Daan Leijen
5614c5052e don't prefer high used candidate if it is too full 2024-12-20 11:56:04 -08:00
Daan Leijen
2db407d1e9 revert back to generating mimalloc.dll instead of mimalloc-override.dll 2024-12-20 11:54:39 -08:00
daanx
3746bf79ed small fixes; max object size 1/8th of a pages 2024-12-19 21:30:03 -08:00
daanx
9a4c264e76 Merge branch 'dev3' into dev3-bin 2024-12-19 19:18:10 -08:00
daanx
de8001c107 add specialized is_set for 1 bit 2024-12-19 19:18:04 -08:00
daanx
8dd605099b fix arm64ec asm 2024-12-19 15:29:40 -08:00
daanx
02b59e0f15 Merge branch 'dev3' into dev3-bin 2024-12-19 11:01:12 -08:00
daanx
b18e1546a7 merge from dev 2024-12-18 15:59:33 -08:00
daanx
2d679959b7 Merge branch 'dev3' into dev3-bin 2024-12-17 19:13:14 -08:00
daanx
264d5a6704 update stat adjustment for purging 2024-12-17 19:13:03 -08:00
daanx
fb90938408 adjust stats more clearly to avoid double counting commits 2024-12-17 19:11:23 -08:00
daanx
2a3969ffc7 Merge branch 'dev3' into dev3-bin 2024-12-17 18:57:20 -08:00
Daan Leijen
58b726be6f better stats for commit on overcommit systems (by not counting on-demand commit upfront) 2024-12-17 18:57:00 -08:00
daanx
587eabe72b Merge branch 'dev3' into dev3-bin 2024-12-17 18:10:37 -08:00
daanx
84bb1c2712 adjust stats more clearly to avoid double counting commits 2024-12-17 18:10:28 -08:00
daanx
21c05019b7 Merge branch 'dev' into dev3 2024-12-17 17:54:24 -08:00
daanx
34d03f3981 atomically clear purge bits when visiting 2024-12-17 12:32:18 -08:00
daanx
6e2a64b81e merge from dev3 2024-12-17 11:58:02 -08:00
daanx
c585753dce fix purging with ranges 2024-12-17 11:54:26 -08:00
daanx
68a90ceb9a add ranges for purging 2024-12-17 11:44:14 -08:00
daanx
adfeb1f6f2 fix bug in bitmap_forall_ranges 2024-12-17 10:43:31 -08:00
daanx
fdad1a0d4f fix infoslices needed calculation 2024-12-17 09:49:09 -08:00
Daan Leijen
98171fd80a testing on arm64 2024-12-17 00:24:32 -08:00
Daan Leijen
d4a2813ff8 Merge branch 'dev3' into dev3-bin 2024-12-17 00:17:32 -08:00
Daan Leijen
63d0c8f861 merge from dev 2024-12-17 00:14:03 -08:00
daanx
d9397be178 comments 2024-12-16 10:00:32 -08:00
daanx
037cb167f8 comments 2024-12-16 09:51:54 -08:00
daanx
d2f670e6e5 add delay to purg'ing; call collect_retired every N generic allocs 2024-12-15 19:54:01 -08:00
daanx
3330d4353a remove maxaccessed from general bitmaps 2024-12-15 19:15:00 -08:00
daanx
e24217e69c more bbin size classes, bug fixes 2024-12-15 18:35:12 -08:00
daanx
df9009a060 wip: binned bitmap for the free slices 2024-12-15 17:15:56 -08:00
daanx
3153e5a4c5 small fixes 2024-12-15 13:47:33 -08:00
daanx
13ee94cef6 fix concurrent mi_tld access bug 2024-12-15 13:22:00 -08:00
daanx
4aeb2e1005 flexible clearN_ that can start at any index 2024-12-15 13:21:13 -08:00
daanx
b5dfd233e9 fix avx2 bug with atomics 2024-12-13 19:59:08 -08:00
daanx
216c04f8d9 clean up bitmap api 2024-12-13 18:39:03 -08:00
daanx
4c81c3cf90 enable purging of free committed slices from arenas 2024-12-13 13:17:00 -08:00
daanx
42af184ce9 wip: start on purge 2024-12-13 09:04:23 -08:00
daanx
ba39e4d65b wip: start on purge 2024-12-13 09:03:17 -08:00
Daan
3010d5890f fix assertion 2024-12-12 20:27:46 -08:00
daanx
e43eb1f191 nicer debug output 2024-12-12 20:22:24 -08:00
daanx
b53ac835f1 comment 2024-12-12 20:01:37 -08:00
daanx
623eaedf33 add debug output for page map; free tld on thread exit 2024-12-12 19:59:54 -08:00
daanx
637de624b3 fix free bug for meta data 2024-12-12 19:55:45 -08:00
daanx
d5c4a16e58 lower full page retain more aggressively in a threadpool 2024-12-12 17:57:36 -08:00
daanx
df956c4a17 use thread spacing for reclaim as well 2024-12-12 17:22:41 -08:00
daanx
98879ac8bc use thread spacing for reclaim as well 2024-12-12 17:22:00 -08:00
daanx
118bd8c97f space out threads when searching for free pages 2024-12-12 16:37:31 -08:00
daanx
94ce342ea9 maintain pages set for arenas; improve arena load/unload 2024-12-11 22:06:25 -08:00
daanx
aed76f2910 wip: allow arena (re)loading 2024-12-11 20:34:23 -08:00
daanx
ccf5e36e6b use frac 8 for reclaim_on_free and reabandon; halve full_page_retain if running in a threadpool 2024-12-11 16:26:39 -08:00
daanx
1c8d15abac fix build error 2024-12-11 14:30:44 -08:00
daanx
ab53a73cbd small updates 2024-12-11 14:29:06 -08:00
daanx
565656919e fix comments in types; fix guarded alignment bug 2024-12-11 13:04:37 -08:00
daanx
64eea823e4 use always abandon on heap delete 2024-12-11 09:24:38 -08:00
daanx
24d3c1bc14 heap meta data always uses mi_meta_zalloc 2024-12-11 09:16:28 -08:00
daanx
6774130c9a Merge ..\mimalloc into dev3 2024-12-10 20:46:12 -08:00
daanx
64c4181ffa better block alignment 2024-12-10 20:32:48 -08:00
daanx
c478ddaab4 fix MI_GUARDED build 2024-12-10 19:44:54 -08:00
daanx
2a1c346281 Merge branch 'dev3' of https://github.com/microsoft/mimalloc into dev3 2024-12-10 15:12:13 -08:00
Daan
13be5d6740 use non-null tld in heap_init 2024-12-10 15:11:46 -08:00
daanx
7cd8f31f30 improve popcount 2024-12-10 14:50:55 -08:00
Daan
f37aff6ee2 fix for macOS 14 and earlier 2024-12-09 22:27:40 -08:00
Daan
6798375f47 temporarily add macOS 13 and 12 for testing 2024-12-09 21:26:23 -08:00
Daan
5e434a6e66 merge from dev 2024-12-09 21:24:30 -08:00
daanx
c5a2d11193 add extra checks for valid pointers in the pagemap, add max_vabits and debug_commit_full_pagemap options 2024-12-09 20:40:26 -08:00
daanx
3a92c35270 improve generic ctz/clz 2024-12-09 20:25:22 -08:00
daanx
e44815ed6f add bsf/bsr for compilation with older compilers (clang 7) 2024-12-09 20:06:48 -08:00
daanx
56a1bd7f9e fix 32 bit multiply in generic ctz/clz 2024-12-09 19:43:00 -08:00
daanx
f28d5c7029 add cast to avoid errors on clang 7 2024-12-09 19:12:03 -08:00
daanx
bbcbd3cd1f add cast to avoid errors on clang 7 2024-12-09 19:06:06 -08:00
Daan
3f732a981f fix debug build of MI_GUARDED 2024-12-09 15:49:20 -08:00
Daan
8f5449d271 various fixes for test pipeline 2024-12-09 15:39:15 -08:00
Daan
351cb0c740 small fixes for macOS 2024-12-09 15:16:36 -08:00
daanx
d5ed0cc71e various improvements 2024-12-09 14:31:43 -08:00
daanx
68ac94c1ba set default arena reserve back to 1GiB 2024-12-08 18:53:43 -08:00
daanx
bf2f2a8bf4 fix bug where only the first chunkmap field would be considered 2024-12-08 18:48:56 -08:00
daanx
88990cec2d merge from dev 2024-12-08 18:27:05 -08:00
daanx
2a4af6f169 comments 2024-12-08 17:21:17 -08:00
daanx
2084df3dde add dedicated meta data allocation for threads and tld 2024-12-08 12:20:54 -08:00
daanx
67cc424ada delete old files 2024-12-08 09:19:05 -08:00
daanx
36bb599873 merge from dev 2024-12-08 09:15:09 -08:00
daanx
2ed6e03d27 update optimization on haswell 2024-12-08 09:14:16 -08:00
daanx
e446bc27e5 Merge ..\mimalloc into dev3 2024-12-08 09:03:33 -08:00
daanx
5a06d2aeba update bit primitives 2024-12-08 09:03:25 -08:00
daanx
c33de86da3 check for running in a threadpool to disable page reclaim 2024-12-07 17:11:11 -08:00
daanx
d0c86f3f0e specialize bitmap operations for common page sizes 2024-12-07 16:26:07 -08:00
daanx
bf42759d97 check heaptag on abandonded page allocation 2024-12-07 15:13:17 -08:00
daanx
6b52b19e3b arch specific optimizations 2024-12-07 15:02:27 -08:00
daanx
0e5d5831e4 Merge ..\mimalloc into dev3 2024-12-07 14:17:05 -08:00
daanx
bef52b96f6 Merge ../mimalloc into dev3 2024-12-07 14:04:02 -08:00
daanx
9631b0d4d2 revise visiting arenas, better bitmap scanning 2024-12-07 14:03:51 -08:00
daanx
70115d8b8c small fixes 2024-12-06 23:25:53 -08:00
daanx
bf9a2ddb59 compile for 32-bit as well 2024-12-06 23:07:10 -08:00
daanx
659a9dd51d fix page info size and order; atomic page flags 2024-12-06 22:37:59 -08:00
daanx
5a5943ad33 record max_clear bit 2024-12-06 21:03:33 -08:00
daanx
61436a92b9 working simplified version without pairmaps and bitmap epoch 2024-12-06 15:26:01 -08:00
daanx
ec9c61c066 initial no more pairmap 2024-12-06 14:53:24 -08:00
daanx
7443ee317e tune free-ing and abandoning 2024-12-05 17:00:23 -08:00
daanx
0616ee151e change to full_page_retain 2024-12-05 11:29:25 -08:00
daanx
bc67be4d79 small adjustments 2024-12-04 21:40:57 -08:00
daanx
afe9089152 more documentation; better pairmap find_and_set_to_busy, busy flag is now 0x10 2024-12-04 19:15:55 -08:00
daanx
45f7fb559a small fixes 2024-12-04 00:14:56 -08:00
daanx
bc7fe399b1 large bitmaps working; lock on arena_reserve 2024-12-03 23:35:33 -08:00
daanx
e5fdd6e110 wip: initial large bitmaps 2024-12-03 22:43:14 -08:00
daanx
8d9c725482 increase MAX_OBJ_SLICES to a full chunk (32MiB) 2024-12-03 17:27:43 -08:00
daanx
3fc2c8e279 fix assertions 2024-12-03 11:06:07 -08:00
daanx
666c089fc8 revise free reclaim; ensure unown cannot race with a free 2024-12-03 10:51:13 -08:00
daanx
833b091ff9 can run the full test suite 2024-12-02 20:25:44 -08:00
daanx
bd5f7de3f4 can run basic test 2024-12-02 20:21:35 -08:00
daanx
fe5a314114 add base and size to OS memid 2024-12-02 19:31:36 -08:00
daanx
5e95ebc7a0 fix free stats 2024-12-02 17:46:41 -08:00
daanx
c9abfe8253 wip: can run mstress 2024-12-02 16:24:40 -08:00
daanx
d96c134566 wip: initial version with eager abandonment 2024-12-02 16:01:45 -08:00
daanx
69ac69abac wip: use epoch with 512bit chunks 2024-12-02 00:31:08 -08:00
daanx
2f789aae9a wip: cannot compile 2024-12-01 16:26:59 -08:00
daanx
1d7a9f62a5 bug fixes 2024-12-01 12:54:16 -08:00
daanx
8f2a5864b8 pass all debug tests 2024-11-30 22:54:57 -08:00
daanx
9ebe941ce0 first version that passes the make test 2024-11-30 20:21:32 -08:00
daanx
55b70f1588 wip 2024-11-30 14:00:07 -08:00
daanx
f8d04dc2bc compile with clang and gcc 2024-11-30 12:41:11 -08:00
daanx
d15e83030e wip: rename arena blocks to slices 2024-11-30 12:16:41 -08:00
daanx
309fc26b4b wip: add generic find_and_xset 2024-11-30 12:00:30 -08:00
daanx
188294a0df wip: bug fixes 2024-11-30 11:12:39 -08:00
daanx
9d904e8643 wip: bug fixes 2024-11-30 10:39:30 -08:00
daanx
978d844e15 wip: bug fixes 2024-11-29 20:23:39 -08:00
daanx
0f635413d6 wip: can run initial test 2024-11-29 17:50:37 -08:00
daanx
e0152ab82f wip: update any_set 2024-11-29 16:58:52 -08:00
daanx
9603fe8b50 can compile without missing functions 2024-11-29 16:27:58 -08:00
daanx
68f5fb2f4b wip: further progress on segment removal; arena allocation 2024-11-29 15:08:06 -08:00
daanx
46afcbe06c wip: further progress on segment removal; arena allocation 2024-11-29 14:28:34 -08:00
daanx
441d4fed9f wip: further progress on removing segments 2024-11-29 10:40:18 -08:00
daanx
71cfa45e76 wip: initial work on mimalloc3 without segments 2024-11-28 19:31:04 -08:00
44 changed files with 6707 additions and 5012 deletions

View file

@ -10,25 +10,30 @@ option(MI_PADDING "Enable padding to detect heap block overflow (alway
option(MI_OVERRIDE "Override the standard malloc interface (i.e. define entry points for 'malloc', 'free', etc)" ON) option(MI_OVERRIDE "Override the standard malloc interface (i.e. define entry points for 'malloc', 'free', etc)" ON)
option(MI_XMALLOC "Enable abort() call on memory allocation failure by default" OFF) option(MI_XMALLOC "Enable abort() call on memory allocation failure by default" OFF)
option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF) option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF)
option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF) option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF)
option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF)
option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF)
option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF)
option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for arm64: '-march=armv8.1-a' (2016))" ON)
option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON)
option(MI_OPT_SIMD "Use SIMD instructions (requires MI_OPT_ARCH to be enabled)" OFF)
option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_SEE_ASM "Generate assembly files" OFF)
option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON)
option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON)
option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON) option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON)
option(MI_WIN_USE_FIXED_TLS "Use a fixed TLS slot on Windows to avoid extra tests in the malloc fast path" OFF) option(MI_WIN_USE_FIXED_TLS "Use a fixed TLS slot on Windows to avoid extra tests in the malloc fast path" OFF)
option(MI_LOCAL_DYNAMIC_TLS "Use local-dynamic-tls, a slightly slower but dlopen-compatible thread local storage mechanism (Unix)" OFF) option(MI_LOCAL_DYNAMIC_TLS "Use local-dynamic-tls, a slightly slower but dlopen-compatible thread local storage mechanism (Unix)" OFF)
option(MI_LIBC_MUSL "Set this when linking with musl libc" OFF) option(MI_LIBC_MUSL "Enable this when linking with musl libc" OFF)
option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF)
option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF)
option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF)
option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF)
option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF)
option(MI_BUILD_SHARED "Build shared library" ON) option(MI_BUILD_SHARED "Build shared library" ON)
option(MI_BUILD_STATIC "Build static library" ON) option(MI_BUILD_STATIC "Build static library" ON)
option(MI_BUILD_OBJECT "Build object library" ON) option(MI_BUILD_OBJECT "Build object library" ON)
option(MI_BUILD_TESTS "Build test executables" ON) option(MI_BUILD_TESTS "Build test executables" ON)
option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF)
option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF)
option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF)
option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF) option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF)
option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." OFF) option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." OFF)
option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF) option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF)
@ -50,6 +55,7 @@ set(mi_sources
src/alloc-aligned.c src/alloc-aligned.c
src/alloc-posix.c src/alloc-posix.c
src/arena.c src/arena.c
src/arena-meta.c
src/bitmap.c src/bitmap.c
src/heap.c src/heap.c
src/init.c src/init.c
@ -57,9 +63,8 @@ set(mi_sources
src/options.c src/options.c
src/os.c src/os.c
src/page.c src/page.c
src/page-map.c
src/random.c src/random.c
src/segment.c
src/segment-map.c
src/stats.c src/stats.c
src/prim/prim.c) src/prim/prim.c)
@ -122,8 +127,8 @@ if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo")
if (NOT MI_OPT_ARCH) if (NOT MI_OPT_ARCH)
message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)") message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)")
endif() endif()
else() #else()
set(MI_OPT_ARCH OFF) # set(MI_OPT_ARCH OFF)
endif() endif()
if(MI_OVERRIDE) if(MI_OVERRIDE)
@ -227,7 +232,7 @@ endif()
if(MI_SEE_ASM) if(MI_SEE_ASM)
message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)") message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)")
list(APPEND mi_cflags -save-temps) list(APPEND mi_cflags -save-temps)
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 14)
message(STATUS "No GNU Line marker") message(STATUS "No GNU Line marker")
list(APPEND mi_cflags -Wno-gnu-line-marker) list(APPEND mi_cflags -Wno-gnu-line-marker)
endif() endif()
@ -398,21 +403,28 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM
list(APPEND mi_cflags -ftls-model=initial-exec) list(APPEND mi_cflags -ftls-model=initial-exec)
endif() endif()
endif() endif()
endif()
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel")
if(MI_OVERRIDE) if(MI_OVERRIDE)
list(APPEND mi_cflags -fno-builtin-malloc) list(APPEND mi_cflags -fno-builtin-malloc)
endif() endif()
if(MI_OPT_ARCH) if(MI_OPT_ARCH)
if(MI_ARCH STREQUAL "arm64") if(MI_ARCH STREQUAL "x64")
set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2;-mtune=native") # fast bit scan (since 2013)
elseif(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a;-mtune=native") # fast atomics (since 2016)
endif() endif()
endif() endif()
endif() endif()
if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) # vs2017+
list(APPEND mi_cflags /Zc:__cplusplus) list(APPEND mi_cflags /Zc:__cplusplus)
if(MI_OPT_ARCH) if(MI_OPT_ARCH)
if(MI_ARCH STREQUAL "arm64") if(MI_ARCH STREQUAL "x64")
set(MI_OPT_ARCH_FLAGS "/arch:armv8.1") # fast atomics set(MI_OPT_ARCH_FLAGS "/arch:AVX2")
elseif(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH_FLAGS "/arch:armv8.1")
endif() endif()
endif() endif()
endif() endif()
@ -424,6 +436,12 @@ endif()
if(MI_OPT_ARCH_FLAGS) if(MI_OPT_ARCH_FLAGS)
list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS}) list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS})
message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)") message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)")
if (MI_OPT_SIMD)
list(APPEND mi_defines "MI_OPT_SIMD=1")
message(STATUS "SIMD instructions are enabled (MI_OPT_SIMD=ON)")
endif()
elseif(MI_OPT_SIMD)
message(STATUS "SIMD instructions are not enabled (either MI_OPT_ARCH=OFF or this architecture has no SIMD support)")
endif() endif()
# extra needed libraries # extra needed libraries

View file

@ -306,3 +306,28 @@ jobs:
- script: ctest --verbose --timeout 240 - script: ctest --verbose --timeout 240
workingDirectory: $(BuildType) workingDirectory: $(BuildType)
displayName: CTest displayName: CTest
- job:
displayName: macOS 13 (Ventura)
pool:
vmImage:
macOS-13
strategy:
matrix:
Debug:
BuildType: debug
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
Release:
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
steps:
- task: CMake@1
inputs:
workingDirectory: $(BuildType)
cmakeArgs: .. $(cmakeExtraArgs)
- script: make -j$(sysctl -n hw.ncpu) -C $(BuildType)
displayName: Make
- script: ctest --verbose --timeout 180
workingDirectory: $(BuildType)
displayName: CTest

View file

@ -1,6 +1,6 @@
set(mi_version_major 1) set(mi_version_major 3)
set(mi_version_minor 8) set(mi_version_minor 0)
set(mi_version_patch 8) set(mi_version_patch 0)
set(mi_version ${mi_version_major}.${mi_version_minor}) set(mi_version ${mi_version_major}.${mi_version_minor})
set(PACKAGE_VERSION ${mi_version}) set(PACKAGE_VERSION ${mi_version})

View file

@ -431,12 +431,11 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large);
/// @param start Start of the memory area /// @param start Start of the memory area
/// @param size The size of the memory area. /// @param size The size of the memory area.
/// @param is_committed Is the area already committed? /// @param is_committed Is the area already committed?
/// @param is_large Does it consist of large OS pages? Set this to \a true as well for memory /// @param is_pinned Can the memory not be decommitted or reset? (usually the case for large OS pages)
/// that should not be decommitted or protected (like rdma etc.)
/// @param is_zero Does the area consists of zero's? /// @param is_zero Does the area consists of zero's?
/// @param numa_node Possible associated numa node or `-1`. /// @param numa_node Possible associated numa node or `-1`.
/// @return \a true if successful, and \a false on error. /// @return \a true if successful, and \a false on error.
bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node); bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node);
/// Reserve \a pages of huge OS pages (1GiB) evenly divided over \a numa_nodes nodes, /// Reserve \a pages of huge OS pages (1GiB) evenly divided over \a numa_nodes nodes,
/// but stops after at most `timeout_msecs` seconds. /// but stops after at most `timeout_msecs` seconds.
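Note: the rename above only changes how externally provided memory is described; the call itself is unchanged. A minimal sketch of donating an externally allocated region with the new is_pinned parameter, assuming a POSIX mmap'd region (the size, flags, and the donate_region helper are illustrative only):

    #include <mimalloc.h>
    #include <sys/mman.h>
    #include <stdbool.h>
    #include <stddef.h>

    // Donate a 64 MiB anonymous mapping to mimalloc as managed OS memory.
    // is_committed=true : the pages are accessible.
    // is_pinned=false   : mimalloc may decommit/reset the region.
    // is_zero=true      : fresh anonymous pages are zero-initialized.
    // numa_node=-1      : no specific NUMA node.
    static bool donate_region(void) {
      const size_t size = 64 * 1024 * 1024;
      void* start = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (start == MAP_FAILED) return false;
      return mi_manage_os_memory(start, size, /*is_committed*/ true,
                                 /*is_pinned*/ false, /*is_zero*/ true,
                                 /*numa_node*/ -1);
    }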

View file

@ -308,6 +308,7 @@
<CompileAs>CompileAsCpp</CompileAs> <CompileAs>CompileAsCpp</CompileAs>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<LanguageStandard>stdcpp20</LanguageStandard> <LanguageStandard>stdcpp20</LanguageStandard>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile> </ClCompile>
<Link> <Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding> <EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -421,16 +422,7 @@
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\alloc-posix.c" /> <ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" /> <ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena-abandoned.c"> <ClCompile Include="..\..\src\arena-meta.c" />
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64EC'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\arena.c" /> <ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c"> <ClCompile Include="..\..\src\bitmap.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild> <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
@ -450,6 +442,7 @@
<ClCompile Include="..\..\src\heap.c" /> <ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" /> <ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\libc.c" /> <ClCompile Include="..\..\src\libc.c" />
<ClCompile Include="..\..\src\page-map.c" />
<ClCompile Include="..\..\src\prim\prim.c" /> <ClCompile Include="..\..\src\prim\prim.c" />
<ClCompile Include="..\..\src\prim\windows\prim.c"> <ClCompile Include="..\..\src\prim\windows\prim.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild> <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
@ -474,8 +467,6 @@
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\page.c" /> <ClCompile Include="..\..\src\page.c" />
<ClCompile Include="..\..\src\random.c" /> <ClCompile Include="..\..\src\random.c" />
<ClCompile Include="..\..\src\segment-map.c" />
<ClCompile Include="..\..\src\segment.c" />
<ClCompile Include="..\..\src\os.c" /> <ClCompile Include="..\..\src\os.c" />
<ClCompile Include="..\..\src\stats.c" /> <ClCompile Include="..\..\src\stats.c" />
</ItemGroup> </ItemGroup>
@ -484,6 +475,7 @@
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-override.h" /> <ClInclude Include="$(ProjectDir)..\..\include\mimalloc-override.h" />
<ClInclude Include="..\..\include\mimalloc-new-delete.h" /> <ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\include\mimalloc\atomic.h" /> <ClInclude Include="..\..\include\mimalloc\atomic.h" />
<ClInclude Include="..\..\include\mimalloc\bits.h" />
<ClInclude Include="..\..\include\mimalloc\internal.h" /> <ClInclude Include="..\..\include\mimalloc\internal.h" />
<ClInclude Include="..\..\include\mimalloc\prim.h" /> <ClInclude Include="..\..\include\mimalloc\prim.h" />
<ClInclude Include="..\..\include\mimalloc\track.h" /> <ClInclude Include="..\..\include\mimalloc\track.h" />

View file

@ -16,9 +16,6 @@
<ClCompile Include="..\..\src\arena.c"> <ClCompile Include="..\..\src\arena.c">
<Filter>Sources</Filter> <Filter>Sources</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\arena-abandoned.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.c"> <ClCompile Include="..\..\src\bitmap.c">
<Filter>Sources</Filter> <Filter>Sources</Filter>
</ClCompile> </ClCompile>
@ -55,15 +52,15 @@
<ClCompile Include="..\..\src\random.c"> <ClCompile Include="..\..\src\random.c">
<Filter>Sources</Filter> <Filter>Sources</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\segment.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\stats.c"> <ClCompile Include="..\..\src\stats.c">
<Filter>Sources</Filter> <Filter>Sources</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\page-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-meta.c">
<Filter>Sources</Filter>
</ClCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClInclude Include="..\..\include\mimalloc\atomic.h"> <ClInclude Include="..\..\include\mimalloc\atomic.h">
@ -93,6 +90,9 @@
<ClInclude Include="..\..\include\mimalloc\prim.h"> <ClInclude Include="..\..\include\mimalloc\prim.h">
<Filter>Headers</Filter> <Filter>Headers</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\..\include\mimalloc\bits.h">
<Filter>Headers</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Filter Include="Headers"> <Filter Include="Headers">

View file

@ -404,11 +404,10 @@
</ItemDefinitionGroup> </ItemDefinitionGroup>
<ItemGroup> <ItemGroup>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h" /> <ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h" />
<ClInclude Include="..\..\include\mimalloc-etw-gen.h" />
<ClInclude Include="..\..\include\mimalloc-etw.h" />
<ClInclude Include="..\..\include\mimalloc-new-delete.h" /> <ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\include\mimalloc-override.h" /> <ClInclude Include="..\..\include\mimalloc-override.h" />
<ClInclude Include="..\..\include\mimalloc\atomic.h" /> <ClInclude Include="..\..\include\mimalloc\atomic.h" />
<ClInclude Include="..\..\include\mimalloc\bits.h" />
<ClInclude Include="..\..\include\mimalloc\internal.h" /> <ClInclude Include="..\..\include\mimalloc\internal.h" />
<ClInclude Include="..\..\include\mimalloc\prim.h" /> <ClInclude Include="..\..\include\mimalloc\prim.h" />
<ClInclude Include="..\..\include\mimalloc\track.h" /> <ClInclude Include="..\..\include\mimalloc\track.h" />
@ -438,7 +437,10 @@
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\alloc-posix.c" /> <ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" /> <ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena-abandoned.c"> <ClCompile Include="..\..\src\arena-meta.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c" />
<ClCompile Include="..\..\src\free.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild> <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild> <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild> <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
@ -448,11 +450,10 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild> <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">true</ExcludedFromBuild> <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">true</ExcludedFromBuild>
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c" />
<ClCompile Include="..\..\src\heap.c" /> <ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" /> <ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\libc.c" /> <ClCompile Include="..\..\src\libc.c" />
<ClCompile Include="..\..\src\page-map.c" />
<ClCompile Include="..\..\src\prim\prim.c" /> <ClCompile Include="..\..\src\prim\prim.c" />
<ClCompile Include="..\..\src\prim\windows\prim.c"> <ClCompile Include="..\..\src\prim\windows\prim.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild> <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
@ -478,13 +479,8 @@
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\page.c" /> <ClCompile Include="..\..\src\page.c" />
<ClCompile Include="..\..\src\random.c" /> <ClCompile Include="..\..\src\random.c" />
<ClCompile Include="..\..\src\segment-map.c" />
<ClCompile Include="..\..\src\segment.c" />
<ClCompile Include="..\..\src\stats.c" /> <ClCompile Include="..\..\src\stats.c" />
</ItemGroup> </ItemGroup>
<ItemGroup>
<None Include="..\..\include\mimalloc-etw-gen.man" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
</ImportGroup> </ImportGroup>

View file

@ -16,9 +16,6 @@
<ClCompile Include="..\..\src\arena.c"> <ClCompile Include="..\..\src\arena.c">
<Filter>Sources</Filter> <Filter>Sources</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\arena-abandoned.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.c"> <ClCompile Include="..\..\src\bitmap.c">
<Filter>Sources</Filter> <Filter>Sources</Filter>
</ClCompile> </ClCompile>
@ -52,15 +49,18 @@
<ClCompile Include="..\..\src\random.c"> <ClCompile Include="..\..\src\random.c">
<Filter>Sources</Filter> <Filter>Sources</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\segment.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\stats.c"> <ClCompile Include="..\..\src\stats.c">
<Filter>Sources</Filter> <Filter>Sources</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\..\src\page-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\free.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-meta.c">
<Filter>Sources</Filter>
</ClCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClInclude Include="..\..\include\mimalloc\atomic.h"> <ClInclude Include="..\..\include\mimalloc\atomic.h">
@ -75,12 +75,6 @@
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h"> <ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h">
<Filter>Headers</Filter> <Filter>Headers</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\..\include\mimalloc-etw.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-etw-gen.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-new-delete.h"> <ClInclude Include="..\..\include\mimalloc-new-delete.h">
<Filter>Headers</Filter> <Filter>Headers</Filter>
</ClInclude> </ClInclude>
@ -96,6 +90,9 @@
<ClInclude Include="..\..\include\mimalloc\prim.h"> <ClInclude Include="..\..\include\mimalloc\prim.h">
<Filter>Headers</Filter> <Filter>Headers</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\..\include\mimalloc\bits.h">
<Filter>Headers</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Filter Include="Headers"> <Filter Include="Headers">
@ -105,9 +102,4 @@
<UniqueIdentifier>{94b40bdc-a741-45dd-81aa-c05fabcd2970}</UniqueIdentifier> <UniqueIdentifier>{94b40bdc-a741-45dd-81aa-c05fabcd2970}</UniqueIdentifier>
</Filter> </Filter>
</ItemGroup> </ItemGroup>
<ItemGroup>
<None Include="..\..\include\mimalloc-etw-gen.man">
<Filter>Sources</Filter>
</None>
</ItemGroup>
</Project> </Project>

View file

@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H #ifndef MIMALLOC_H
#define MIMALLOC_H #define MIMALLOC_H
#define MI_MALLOC_VERSION 188 // major + 2 digits minor #define MI_MALLOC_VERSION 300 // major + 2 digits minor
// ------------------------------------------------------ // ------------------------------------------------------
// Compiler specific attributes // Compiler specific attributes
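Note: downstream code that must support both the 1.8/2.x and 3.x headers can branch on this macro; a trivial sketch (the comments reflect the changes shown further below in this diff):

    #include <mimalloc.h>

    #if MI_MALLOC_VERSION >= 300
      // mimalloc 3.x: mi_arena_id_t is a pointer type; arena load/unload is available.
    #else
      // mimalloc 1.8/2.x: mi_arena_id_t is an int.
    #endif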
@ -274,16 +274,16 @@ mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa
mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept;
mi_decl_export void mi_debug_show_arenas(bool show_inuse) mi_attr_noexcept; mi_decl_export void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept;
// Experimental: heaps associated with specific memory arena's // Experimental: heaps associated with specific memory arena's
typedef int mi_arena_id_t; typedef void* mi_arena_id_t;
mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size);
mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
#if MI_MALLOC_VERSION >= 182 #if MI_MALLOC_VERSION >= 182
// Create a heap that only allocates in the specified arena // Create a heap that only allocates in the specified arena
@ -317,6 +317,23 @@ mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t samp
mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max); mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max);
// experimental
//mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size);
//mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size);
//mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size);
//mi_decl_export void mi_os_free(void* p, size_t size);
//mi_decl_export void mi_os_commit(void* p, size_t size);
//mi_decl_export void mi_os_decommit(void* p, size_t size);
mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size);
mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id);
mi_decl_export bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena);
mi_decl_export void mi_heap_unload(mi_heap_t* heap);
// Is a pointer contained in the given arena area?
mi_decl_export bool mi_arena_contains(mi_arena_id_t arena_id, const void* p);
// ------------------------------------------------------ // ------------------------------------------------------
// Convenience // Convenience
// ------------------------------------------------------ // ------------------------------------------------------
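Note: a rough sketch of how the experimental unload/reload entry points above could compose, using the existing exclusive-arena API (mi_reserve_os_memory_ex, mi_heap_new_in_arena). The exact lifecycle rules (whether the heap pointer stays valid across unload, and whether reload must happen at the original base address) are not spelled out here, so treat this as illustrative only; error handling is omitted:

    #include <mimalloc.h>

    void arena_roundtrip(void) {
      // Reserve an exclusive 64 MiB arena and create a heap that allocates only in it.
      mi_arena_id_t arena;
      mi_reserve_os_memory_ex(64 * 1024 * 1024, /*commit*/ true, /*allow_large*/ false,
                              /*exclusive*/ true, &arena);
      mi_heap_t* heap = mi_heap_new_in_arena(arena);
      void* p = mi_heap_malloc(heap, 128);

      // Detach: stop using the heap, then unload the arena to recover its base/size.
      mi_heap_unload(heap);
      void* base; size_t accessed; size_t size;
      mi_arena_unload(arena, &base, &accessed, &size);

      // ... the [base, base+size) region could be kept aside or handed to another component ...

      // Re-attach: reload the arena and rebind the heap to it, then use the old block again.
      mi_arena_reload(base, size, &arena);
      mi_heap_reload(heap, arena);
      mi_free(p);
      mi_heap_delete(heap);
    }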
@ -369,7 +386,6 @@ typedef enum mi_option_e {
mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`) mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`)
mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10) mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10)
mi_option_purge_extend_delay, mi_option_purge_extend_delay,
mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1)
mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's)
mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. (only on windows) mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. (only on windows)
mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0) mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0)
@ -379,6 +395,12 @@ typedef enum mi_option_e {
mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000)
mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0)
mi_option_target_segments_per_thread, // experimental (=0) mi_option_target_segments_per_thread, // experimental (=0)
mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1)
mi_option_page_full_retain, // retain N full pages per size class (=2)
mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4)
mi_option_max_vabits, // max user space virtual address bits to consider (=48)
mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0)
mi_option_page_commit_on_demand, // commit page memory on-demand
_mi_option_last, _mi_option_last,
// legacy option names // legacy option names
mi_option_large_os_pages = mi_option_allow_large_os_pages, mi_option_large_os_pages = mi_option_allow_large_os_pages,
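Note: like the existing options, the new entries can be set programmatically before the first allocation via mi_option_set/mi_option_enable, or through the usual MIMALLOC_ environment variables. A small example; the values are for illustration only, not recommended defaults:

    #include <mimalloc.h>

    void configure_mimalloc(void) {
      mi_option_set(mi_option_page_full_retain, 4);        // keep a few more full pages per size class
      mi_option_enable(mi_option_page_commit_on_demand);   // commit page memory lazily
      mi_option_disable(mi_option_reclaim_on_free);        // never reclaim an abandoned page on free
    }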

View file

@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution. "LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/ -----------------------------------------------------------------------------*/
#pragma once #pragma once
#ifndef MIMALLOC_ATOMIC_H #ifndef MI_ATOMIC_H
#define MIMALLOC_ATOMIC_H #define MI_ATOMIC_H
// include windows.h or pthreads.h // include windows.h or pthreads.h
#if defined(_WIN32) #if defined(_WIN32)
@ -75,16 +75,21 @@ terms of the MIT license. A copy of the license can be found in the file
#define mi_atomic_exchange_relaxed(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_exchange_relaxed(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release)) #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel)) #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
#define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) #define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
#define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel)) #define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel)) #define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_and_relaxed(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel)) #define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_or_relaxed(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel)) #define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1) #define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1)
@ -405,10 +410,9 @@ static inline void mi_atomic_yield(void) {
// ---------------------------------------------------------------------- // ----------------------------------------------------------------------
// Locks // Locks
// These do not have to be recursive and should be light-weight // These should be light-weight in-process only locks.
// in-process only locks. Only used for reserving arena's and to // Only used for reserving arena's and to maintain the abandoned list.
// maintain the abandoned list.
// ---------------------------------------------------------------------- // ----------------------------------------------------------------------
#if _MSC_VER #if _MSC_VER
#pragma warning(disable:26110) // unlock with holding lock #pragma warning(disable:26110) // unlock with holding lock
@ -534,4 +538,4 @@ static inline void mi_lock_done(mi_lock_t* lock) {
#endif #endif
#endif // __MIMALLOC_ATOMIC_H #endif // MI_ATOMIC_H

include/mimalloc/bits.h (new file, 336 lines)
View file

@ -0,0 +1,336 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
Bit operation, and platform dependent definition (MI_INTPTR_SIZE etc)
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITS_H
#define MI_BITS_H
// ------------------------------------------------------
// Size of a pointer.
// We assume that `sizeof(void*)==sizeof(intptr_t)`
// and it holds for all platforms we know of.
//
// However, the C standard only requires that:
// p == (void*)((intptr_t)p)
// but we also need:
// i == (intptr_t)((void*)i)
// or otherwise one might define an intptr_t type that is larger than a pointer...
// ------------------------------------------------------
#if INTPTR_MAX > INT64_MAX
# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example)
#elif INTPTR_MAX == INT64_MAX
# define MI_INTPTR_SHIFT (3)
#elif INTPTR_MAX == INT32_MAX
# define MI_INTPTR_SHIFT (2)
#else
#error platform pointers must be 32, 64, or 128 bits
#endif
#if (INTPTR_MAX) > LONG_MAX
# define MI_PU(x) x##ULL
#else
# define MI_PU(x) x##UL
#endif
#if SIZE_MAX == UINT64_MAX
# define MI_SIZE_SHIFT (3)
typedef int64_t mi_ssize_t;
#elif SIZE_MAX == UINT32_MAX
# define MI_SIZE_SHIFT (2)
typedef int32_t mi_ssize_t;
#else
#error platform objects must be 32 or 64 bits in size
#endif
#if (SIZE_MAX/2) > LONG_MAX
# define MI_ZU(x) x##ULL
#else
# define MI_ZU(x) x##UL
#endif
#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8)
#define MI_SIZE_SIZE (1<<MI_SIZE_SHIFT)
#define MI_SIZE_BITS (MI_SIZE_SIZE*8)
#define MI_KiB (MI_ZU(1024))
#define MI_MiB (MI_KiB*MI_KiB)
#define MI_GiB (MI_MiB*MI_KiB)
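As a quick worked example (not part of this diff) of why the `MI_ZU`/`MI_PU` literal macros and the derived size constants matter: on LLP64 Windows `unsigned long` is only 32 bits, so size literals need the 64-bit suffix that `MI_ZU` selects there. The `example_` names below are made up for illustration.
// Illustrative only:
static const size_t example_reserve_size = 32 * MI_GiB;                    // 2^35 bytes; needs a 64-bit literal on LLP64
static const size_t example_slice_count  = (32 * MI_GiB) / (64 * MI_KiB);  // = 524288 slices of 64 KiB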
/* --------------------------------------------------------------------------------
Architecture
-------------------------------------------------------------------------------- */
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) // consider arm64ec as arm64
#define MI_ARCH_ARM64 1
#elif defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
#define MI_ARCH_X64 1
#elif defined(__i386__) || defined(__i386) || defined(_M_IX86) || defined(_X86_) || defined(__X86__)
#define MI_ARCH_X86 1
#elif defined(__arm__) || defined(_ARM) || defined(_M_ARM) || defined(_M_ARMT) || defined(__arm)
#define MI_ARCH_ARM32 1
#elif defined(__riscv) || defined(_M_RISCV)
#define MI_ARCH_RISCV 1
#if (LONG_MAX == INT32_MAX)
#define MI_ARCH_RISCV32 1
#else
#define MI_ARCH_RISCV64 1
#endif
#endif
#if MI_ARCH_X64 && defined(__AVX2__)
#include <immintrin.h>
#elif MI_ARCH_ARM64 && MI_OPT_SIMD
#include <arm_neon.h>
#endif
#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
#include <intrin.h>
#endif
#if MI_ARCH_X64 && defined(__AVX2__) && !defined(__BMI2__) // msvc
#define __BMI2__ 1
#endif
#if MI_ARCH_X64 && (defined(__AVX2__) || defined(__BMI2__)) && !defined(__BMI1__) // msvc
#define __BMI1__ 1
#endif
// Define big endian if needed
// #define MI_BIG_ENDIAN 1
// maximum virtual address bits in a user-space pointer
#if MI_DEFAULT_VIRTUAL_ADDRESS_BITS > 0
#define MI_MAX_VABITS MI_DEFAULT_VIRTUAL_ADDRESS_BITS
#elif MI_ARCH_X64
#define MI_MAX_VABITS (47)
#elif MI_INTPTR_SIZE > 4
#define MI_MAX_VABITS (48)
#else
#define MI_MAX_VABITS (32)
#endif
// use a flat page-map (or a 2-level one)
#ifndef MI_PAGE_MAP_FLAT
#if MI_MAX_VABITS <= 40 && !defined(__APPLE__)
#define MI_PAGE_MAP_FLAT 1
#else
#define MI_PAGE_MAP_FLAT 0
#endif
#endif
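To make the `MI_PAGE_MAP_FLAT` trade-off concrete, here is a rough sketch (hypothetical code, not mimalloc's actual page map; it only shows how `MI_MAX_VABITS` and the 64 KiB slice granularity drive the choice):
#include <stddef.h>
#include <stdint.h>

#define EXAMPLE_VABITS       40     // assumed user-space address bits
#define EXAMPLE_SLICE_SHIFT  16     // 64 KiB slices (13 + MI_SIZE_SHIFT on 64-bit)

// A flat map keeps one entry per slice over the whole address space:
// 2^(40-16) = 16M entries (~128 MiB of mostly untouched virtual memory).
// With 47/48 VA bits this grows to 2^31..2^32 entries, hence the 2-level map there.
extern void* example_flat_map[(size_t)1 << (EXAMPLE_VABITS - EXAMPLE_SLICE_SHIFT)];

static inline void* example_flat_lookup(const void* p) {
  return example_flat_map[(uintptr_t)p >> EXAMPLE_SLICE_SHIFT];  // a single dependent load
}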
/* --------------------------------------------------------------------------------
Builtin's
-------------------------------------------------------------------------------- */
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#define mi_builtin(name) __builtin_##name
#define mi_has_builtin(name) __has_builtin(__builtin_##name)
#if (LONG_MAX == INT32_MAX)
#define mi_builtin32(name) mi_builtin(name##l)
#define mi_has_builtin32(name) mi_has_builtin(name##l)
#else
#define mi_builtin32(name) mi_builtin(name)
#define mi_has_builtin32(name) mi_has_builtin(name)
#endif
#if (LONG_MAX == INT64_MAX)
#define mi_builtin64(name) mi_builtin(name##l)
#define mi_has_builtin64(name) mi_has_builtin(name##l)
#else
#define mi_builtin64(name) mi_builtin(name##ll)
#define mi_has_builtin64(name) mi_has_builtin(name##ll)
#endif
#if (MI_SIZE_BITS == 32)
#define mi_builtinz(name) mi_builtin32(name)
#define mi_has_builtinz(name) mi_has_builtin32(name)
#define mi_msc_builtinz(name) name
#elif (MI_SIZE_BITS == 64)
#define mi_builtinz(name) mi_builtin64(name)
#define mi_has_builtinz(name) mi_has_builtin64(name)
#define mi_msc_builtinz(name) name##64
#endif
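For reference, a worked expansion of the selection macros above on two common ABIs (illustrative only):
// LP64 (e.g. x86-64 Linux): MI_SIZE_BITS==64 and LONG_MAX==INT64_MAX, so
//   mi_builtinz(ctz)                  -> __builtin_ctzl
//   mi_has_builtinz(ctz)              -> __has_builtin(__builtin_ctzl)
//   mi_msc_builtinz(_BitScanForward)  -> _BitScanForward64
// LLP64 (64-bit Windows): LONG_MAX==INT32_MAX, so mi_builtinz(ctz) -> __builtin_ctzll instead.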
/* --------------------------------------------------------------------------------
Popcount and count trailing/leading zero's
-------------------------------------------------------------------------------- */
size_t _mi_popcount_generic(size_t x);
static inline size_t mi_popcount(size_t x) {
#if mi_has_builtinz(popcount)
return mi_builtinz(popcount)(x);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
return mi_msc_builtinz(__popcnt)(x);
#elif MI_ARCH_X64 && defined(__BMI1__)
return (size_t)_mm_popcnt_u64(x);
#else
#define MI_HAS_FAST_POPCOUNT 0
return (x<=1 ? x : _mi_popcount_generic(x));
#endif
}
#ifndef MI_HAS_FAST_POPCOUNT
#define MI_HAS_FAST_POPCOUNT 1
#endif
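`_mi_popcount_generic` is only declared here; its definition lives in a source file outside this diff. A portable SWAR-style fallback along these lines would fit the declaration (a sketch only, not necessarily the actual implementation):
// Branch-free popcount that works for both 32- and 64-bit `size_t`
// (SIZE_MAX comes from <stdint.h>).
static size_t example_popcount_generic(size_t x) {
  x = x - ((x >> 1) & (SIZE_MAX/3));                          // sum bit pairs      (0x5555..)
  x = (x & (SIZE_MAX/15*3)) + ((x >> 2) & (SIZE_MAX/15*3));   // sum nibbles        (0x3333..)
  x = (x + (x >> 4)) & (SIZE_MAX/255*15);                     // sum bytes          (0x0f0f..)
  return (x * (SIZE_MAX/255)) >> (MI_SIZE_BITS - 8);          // add all byte sums  (0x0101..)
}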
size_t _mi_clz_generic(size_t x);
size_t _mi_ctz_generic(size_t x);
static inline size_t mi_ctz(size_t x) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0
size_t r;
__asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return r;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long idx;
return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS);
#elif mi_has_builtinz(ctz)
return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS);
#elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86)
if (x==0) return MI_SIZE_BITS;
size_t r;
__asm ("bsf\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return r;
#elif MI_HAS_FAST_POPCOUNT
return (x!=0 ? (mi_popcount(x^(x-1))-1) : MI_SIZE_BITS);
#else
#define MI_HAS_FAST_BITSCAN 0
return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS);
#endif
}
static inline size_t mi_clz(size_t x) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0
size_t r;
__asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return r;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long idx;
return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS);
#elif mi_has_builtinz(clz)
return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS);
#elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86)
if (x==0) return MI_SIZE_BITS;
size_t r;
__asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return (MI_SIZE_BITS - 1 - r);
#else
#define MI_HAS_FAST_BITSCAN 0
return (x!=0 ? _mi_clz_generic(x) : MI_SIZE_BITS);
#endif
}
#ifndef MI_HAS_FAST_BITSCAN
#define MI_HAS_FAST_BITSCAN 1
#endif
/* --------------------------------------------------------------------------------
find trailing/leading zero (bit scan forward/reverse)
-------------------------------------------------------------------------------- */
// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's)
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bsf(size_t x, size_t* idx) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9)
// on x64 the carry flag is set on zero which gives better codegen
bool is_zero;
__asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" );
return !is_zero;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long i;
return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false);
#else
return (x!=0 ? (*idx = mi_ctz(x), true) : false);
#endif
}
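A typical usage pattern for `mi_bsf` (example code, not from this diff): visiting every set bit in a bitfield word, as a bitmap-based allocator does when scanning for free slices.
static void example_visit_set_bits(size_t bits, void (*visit)(size_t bit_idx)) {
  size_t idx;
  while (mi_bsf(bits, &idx)) {   // returns false once bits==0
    visit(idx);                  // idx is the least significant set bit
    bits &= (bits - 1);          // clear that bit and continue
  }
}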
// Bit scan reverse: find the most significant bit that is set
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bsr(size_t x, size_t* idx) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9)
// on x64 the carry flag is set on zero which gives better codegen
bool is_zero;
__asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc");
return !is_zero;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long i;
return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false);
#else
return (x!=0 ? (*idx = MI_SIZE_BITS - 1 - mi_clz(x), true) : false);
#endif
}
/* --------------------------------------------------------------------------------
rotate
-------------------------------------------------------------------------------- */
static inline size_t mi_rotr(size_t x, size_t r) {
#if (mi_has_builtin(rotateright64) && MI_SIZE_BITS==64)
return mi_builtin(rotateright64)(x,r);
#elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32)
return mi_builtin(rotateright32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
return _rotr64(x, (int)r);
#elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
return _lrotr(x,(int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1);
return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1))));
#endif
}
static inline size_t mi_rotl(size_t x, size_t r) {
#if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64)
return mi_builtin(rotateleft64)(x,r);
#elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32)
return mi_builtin(rotateleft32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
return _rotl64(x, (int)r);
#elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
return _lrotl(x, (int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1);
return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1))));
#endif
}
static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) {
#if mi_has_builtin(rotateleft32)
return mi_builtin(rotateleft32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
return _lrotl(x, (int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & 31;
return ((x << rshift) | (x >> ((-rshift) & 31)));
#endif
}
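To see why the masking trick in the portable fall-backs above avoids undefined behaviour, consider the `r == 0` case (worked example assuming `MI_SIZE_BITS == 64`):
//   rshift == 0, so (-rshift) & (MI_SIZE_BITS-1) == 0
//   mi_rotr(x,0) == (x >> 0) | (x << 0) == x      -- both shift counts are valid
//   the naive form would evaluate x << (64 - 0), a shift by the full type width,
//   which is undefined behaviour in C.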
#endif // MI_BITS_H

File diff suppressed because it is too large

include/mimalloc/prim.h

@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution. "LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/ -----------------------------------------------------------------------------*/
#pragma once #pragma once
#ifndef MIMALLOC_PRIM_H #ifndef MI_PRIM_H
#define MIMALLOC_PRIM_H #define MI_PRIM_H
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
@ -117,7 +117,8 @@ void _mi_prim_thread_done_auto_done(void);
// Called when the default heap for a thread changes // Called when the default heap for a thread changes
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
// Is this thread part of a thread pool?
bool _mi_prim_thread_is_in_threadpool(void);
@ -269,35 +270,42 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
// defined in `init.c`; do not use these directly // defined in `init.c`; do not use these directly
extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
extern bool _mi_process_is_initialized; // has mi_process_init been called? extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called?
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept;
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
const mi_threadid_t tid = __mi_prim_thread_id();
mi_assert_internal(tid > 1);
mi_assert_internal((tid & MI_PAGE_FLAG_MASK) == 0); // bottom 2 bits are clear?
return tid;
}
// Get a unique id for the current thread. // Get a unique id for the current thread.
#if defined(MI_PRIM_THREAD_ID) #if defined(MI_PRIM_THREAD_ID)
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488) return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488)
} }
#elif defined(_WIN32) #elif defined(_WIN32)
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
// Windows: works on Intel and ARM in both 32- and 64-bit // Windows: works on Intel and ARM in both 32- and 64-bit
return (uintptr_t)NtCurrentTeb(); return (uintptr_t)NtCurrentTeb();
} }
#elif MI_USE_BUILTIN_THREAD_POINTER #elif MI_USE_BUILTIN_THREAD_POINTER
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
// Works on most Unix based platforms with recent compilers // Works on most Unix based platforms with recent compilers
return (uintptr_t)__builtin_thread_pointer(); return (uintptr_t)__builtin_thread_pointer();
} }
#elif MI_HAS_TLS_SLOT #elif MI_HAS_TLS_SLOT
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
#if defined(__BIONIC__) #if defined(__BIONIC__)
// issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
// see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
@ -313,7 +321,7 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
#else #else
// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). // otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
return (uintptr_t)&_mi_heap_default; return (uintptr_t)&_mi_heap_default;
} }
@ -416,4 +424,4 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) {
#endif // mi_prim_get_default_heap() #endif // mi_prim_get_default_heap()
#endif // MIMALLOC_PRIM_H #endif // MI_PRIM_H
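The `__mi_prim_thread_id`/`_mi_prim_thread_id` split introduced above lets every backend share the same sanity checks. A hypothetical embedder override (names invented for illustration) plugs in through `MI_PRIM_THREAD_ID`, exactly as the first branch shows:
// Defined by the embedder before building mimalloc (illustrative only):
//   #define MI_PRIM_THREAD_ID()  ((mi_threadid_t)my_runtime_current_thread_id())
// Whatever the backend returns, the wrapper still asserts that the id is > 1 and that
// its bottom MI_PAGE_FLAG_MASK bits are clear, since those bits are reused as page
// flags inside `xthread_id` (see the types.h changes below).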

include/mimalloc/track.h

@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution. "LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/ -----------------------------------------------------------------------------*/
#pragma once #pragma once
#ifndef MIMALLOC_TRACK_H #ifndef MI_TRACK_H
#define MIMALLOC_TRACK_H #define MI_TRACK_H
/* ------------------------------------------------------------------------------------------------------ /* ------------------------------------------------------------------------------------------------------
Track memory ranges with macros for tools like Valgrind address sanitizer, or other memory checkers. Track memory ranges with macros for tools like Valgrind address sanitizer, or other memory checkers.
@ -142,4 +142,4 @@ defined, undefined, or not accessible at all:
} }
#endif #endif
#endif #endif // MI_TRACK_H

include/mimalloc/types.h

@ -5,17 +5,15 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution. "LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/ -----------------------------------------------------------------------------*/
#pragma once #pragma once
#ifndef MIMALLOC_TYPES_H #ifndef MI_TYPES_H
#define MIMALLOC_TYPES_H #define MI_TYPES_H
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
// This file contains the main type definitions for mimalloc: // This file contains the main type definitions for mimalloc:
// mi_heap_t : all data for a thread-local heap, contains // mi_heap_t : all data for a thread-local heap, contains
// lists of all managed heap pages. // lists of all managed heap pages.
// mi_segment_t : a larger chunk of memory (32GiB) from where pages
// are allocated.
// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from // mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from
// where objects are allocated. // where objects of a single size are allocated.
// Note: we write "OS page" for OS memory pages while // Note: we write "OS page" for OS memory pages while
// using plain "page" for mimalloc pages (`mi_page_t`). // using plain "page" for mimalloc pages (`mi_page_t`).
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
@ -23,11 +21,9 @@ terms of the MIT license. A copy of the license can be found in the file
#include <stddef.h> // ptrdiff_t #include <stddef.h> // ptrdiff_t
#include <stdint.h> // uintptr_t, uint16_t, etc #include <stdint.h> // uintptr_t, uint16_t, etc
#include "atomic.h" // _Atomic #include <errno.h> // error codes
#include "bits.h" // size defines (MI_INTPTR_SIZE etc), bit operations
#ifdef _MSC_VER #include "atomic.h" // _Atomic primitives
#pragma warning(disable:4214) // bitfield is not int
#endif
// Minimal alignment necessary. On most platforms 16 bytes are needed // Minimal alignment necessary. On most platforms 16 bytes are needed
// due to SSE registers for example. This must be at least `sizeof(void*)` // due to SSE registers for example. This must be at least `sizeof(void*)`
@ -50,11 +46,17 @@ terms of the MIT license. A copy of the license can be found in the file
// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). // Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
// #define MI_STAT 1 // #define MI_STAT 1
// Define MI_SECURE to enable security mitigations // Define MI_SECURE to enable security mitigations. Level 1 has minimal performance impact,
// #define MI_SECURE 1 // guard page around metadata // but protects most metadata with guard pages:
// #define MI_SECURE 2 // guard page around each mimalloc page // #define MI_SECURE 1 // guard page around metadata
// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) //
// #define MI_SECURE 4  // checks for double free. (may be more expensive) // Level 2 has more performance impact but protects well against various buffer overflows
// by surrounding all mimalloc pages with guard pages:
// #define MI_SECURE 2 // guard page around each mimalloc page (can fragment VMA's with large heaps..)
//
// The next two levels can have more performance cost:
// #define MI_SECURE 3 // randomize allocations, encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
// #define MI_SECURE 4 // checks for double free. (may be more expensive)
#if !defined(MI_SECURE) #if !defined(MI_SECURE)
#define MI_SECURE 0 #define MI_SECURE 0
@ -97,124 +99,130 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_ENCODE_FREELIST 1 #define MI_ENCODE_FREELIST 1
#endif #endif
// Enable large pages for objects between 128KiB and 512KiB. Disabled by default.
#ifndef MI_ENABLE_LARGE_PAGES
#define MI_ENABLE_LARGE_PAGES 0
#endif
// We used to abandon huge pages in order to eagerly deallocate it if freed from another thread. // --------------------------------------------------------------
// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a // Sizes of internal data-structures
// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from // (comments specify sizes on 64-bit, usually 32-bit is halved)
// another thread so the memory becomes "virtually" available (and eventually gets properly freed by // --------------------------------------------------------------
// the owning thread).
// #define MI_HUGE_PAGE_ABANDON 1
// Sizes are for 64-bit
// ------------------------------------------------------ #ifndef MI_ARENA_SLICE_SHIFT
// Platform specific values #ifdef MI_SMALL_PAGE_SHIFT // backward compatibility
// ------------------------------------------------------ #define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT
// ------------------------------------------------------
// Size of a pointer.
// We assume that `sizeof(void*)==sizeof(intptr_t)`
// and it holds for all platforms we know of.
//
// However, the C standard only requires that:
// p == (void*)((intptr_t)p))
// but we also need:
// i == (intptr_t)((void*)i)
// or otherwise one might define an intptr_t type that is larger than a pointer...
// ------------------------------------------------------
#if INTPTR_MAX > INT64_MAX
# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example)
#elif INTPTR_MAX == INT64_MAX
# define MI_INTPTR_SHIFT (3)
#elif INTPTR_MAX == INT32_MAX
# define MI_INTPTR_SHIFT (2)
#else #else
#error platform pointers must be 32, 64, or 128 bits #define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit)
#endif
#endif
#ifndef MI_BCHUNK_BITS_SHIFT
#define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512)
#endif #endif
#if SIZE_MAX == UINT64_MAX #define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) // sub-bitmaps are "bchunks" of 512 bits
# define MI_SIZE_SHIFT (3) #define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) // arena's allocate in slices of 64 KiB
typedef int64_t mi_ssize_t; #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE)
#elif SIZE_MAX == UINT32_MAX
# define MI_SIZE_SHIFT (2)
typedef int32_t mi_ssize_t;
#else
#error platform objects must be 32 or 64 bits
#endif
#if (SIZE_MAX/2) > LONG_MAX #define MI_ARENA_MIN_OBJ_SLICES (1)
# define MI_ZU(x) x##ULL #define MI_ARENA_MAX_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries)
# define MI_ZI(x) x##LL
#else
# define MI_ZU(x) x##UL
# define MI_ZI(x) x##L
#endif
#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE)
#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8) #define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE)
#define MI_SIZE_SIZE (1<<MI_SIZE_SHIFT) #define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE // 64 KiB
#define MI_SIZE_BITS (MI_SIZE_SIZE*8) #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bchunk bitmap)
#define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bchunk bitmap)
#define MI_KiB (MI_ZU(1024))
#define MI_MiB (MI_KiB*MI_KiB)
#define MI_GiB (MI_MiB*MI_KiB)
// ------------------------------------------------------
// Main internal data-structures
// ------------------------------------------------------
// Main tuning parameters for segment and page sizes
// Sizes for 64-bit, divide by two for 32-bit
#ifndef MI_SMALL_PAGE_SHIFT
#define MI_SMALL_PAGE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB
#endif
#ifndef MI_MEDIUM_PAGE_SHIFT
#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB
#endif
#ifndef MI_LARGE_PAGE_SHIFT
#define MI_LARGE_PAGE_SHIFT ( 3 + MI_MEDIUM_PAGE_SHIFT) // 4MiB
#endif
#ifndef MI_SEGMENT_SHIFT
#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB -- must be equal to `MI_LARGE_PAGE_SHIFT`
#endif
// Derived constants
#define MI_SEGMENT_SIZE (MI_ZU(1)<<MI_SEGMENT_SHIFT)
#define MI_SEGMENT_ALIGN (MI_SEGMENT_SIZE)
#define MI_SEGMENT_MASK ((uintptr_t)(MI_SEGMENT_ALIGN - 1))
#define MI_SMALL_PAGE_SIZE (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
#define MI_MEDIUM_PAGE_SIZE (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
#define MI_LARGE_PAGE_SIZE (MI_ZU(1)<<MI_LARGE_PAGE_SHIFT)
#define MI_SMALL_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
#define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)
#define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE)
// The max object size are checked to not waste more than 12.5% internally over the page sizes.
// (Except for large pages since huge objects are allocated in 4MiB chunks)
#define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/4) // 16KiB
#define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128KiB
#define MI_LARGE_OBJ_SIZE_MAX (MI_LARGE_PAGE_SIZE/2) // 2MiB
#define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
// Maximum number of size classes. (spaced exponentially in 12.5% increments) // Maximum number of size classes. (spaced exponentially in 12.5% increments)
#define MI_BIN_HUGE (73U) #define MI_BIN_HUGE (73U)
#define MI_BIN_FULL (MI_BIN_HUGE+1)
#define MI_BIN_COUNT (MI_BIN_FULL+1)
#if (MI_LARGE_OBJ_WSIZE_MAX >= 655360)
#error "mimalloc internal: define more bins"
#endif
// Maximum block size for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`)
#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX)
// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments
#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1)
// We never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>) // We never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX
// Minimal commit size for on-demand committed pages (should be >= OS page size)
#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE // (4*MI_KiB)
// ------------------------------------------------------
// Arena's are large reserved areas of memory allocated from
// the OS that are managed by mimalloc to efficiently
// allocate MI_ARENA_SLICE_SIZE slices of memory for the
// mimalloc pages.
// ------------------------------------------------------
// A large memory arena where pages are allocated in.
typedef struct mi_arena_s mi_arena_t; // defined in `arena.c`
// ---------------------------------------------------------------
// a memory id tracks the provenance of arena/OS allocated memory
// ---------------------------------------------------------------
// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated.
// The memid keeps track of this.
typedef enum mi_memkind_e {
MI_MEM_NONE, // not allocated
MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
MI_MEM_STATIC, // allocated in a static area and should not be freed (the initial main heap data for example (`init.c`))
MI_MEM_META, // allocated with the meta data allocator (`arena-meta.c`)
MI_MEM_OS, // allocated from the OS
MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`)
MI_MEM_ARENA // allocated from an arena (the usual case) (`arena.c`)
} mi_memkind_t;
static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
}
static inline bool mi_memkind_needs_no_free(mi_memkind_t memkind) {
return (memkind <= MI_MEM_STATIC);
}
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t size; // allocated full size
// size_t alignment; // alignment at allocation
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
mi_arena_t* arena; // arena that contains this memory
uint32_t slice_index; // slice index in the arena
uint32_t slice_count; // allocated slices
} mi_memid_arena_info_t;
typedef struct mi_memid_meta_info {
void* meta_page; // meta-page that contains the block
uint32_t block_index; // block index in the meta-data page
uint32_t block_count; // allocated blocks
} mi_memid_meta_info_t;
typedef struct mi_memid_s {
union {
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
mi_memid_meta_info_t meta; // only used for MI_MEM_META
} mem;
mi_memkind_t memkind;
bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages)
bool initially_committed;// `true` if the memory was originally allocated as committed
bool initially_zero; // `true` if the memory was originally zero initialized
} mi_memid_t;
static inline bool mi_memid_is_os(mi_memid_t memid) {
return mi_memkind_is_os(memid.memkind);
}
static inline bool mi_memid_needs_no_free(mi_memid_t memid) {
return mi_memkind_needs_no_free(memid.memkind);
}
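To illustrate how the provenance in `mi_memid_t` is meant to be consumed, a release path would dispatch on `memkind` roughly as follows (hypothetical helper, not code from this diff):
static void example_release(void* p, mi_memid_t memid) {
  if (mi_memid_needs_no_free(memid)) return;   // MI_MEM_NONE / MI_MEM_EXTERNAL / MI_MEM_STATIC
  if (memid.memkind == MI_MEM_ARENA) {
    // return slices [slice_index, slice_index + slice_count) to memid.mem.arena.arena
  }
  else if (mi_memid_is_os(memid)) {
    // unmap the full block memid.mem.os.base / memid.mem.os.size -- not just `p`,
    // since offset-aligned allocations may start inside the OS block
  }
  else {
    // MI_MEM_META: free blocks [block_index, block_index + block_count) in memid.mem.meta.meta_page
  }
  (void)p;
}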
// ------------------------------------------------------ // ------------------------------------------------------
// Mimalloc pages contain allocated blocks // Mimalloc pages contain allocated blocks
@ -232,48 +240,28 @@ typedef struct mi_block_s {
mi_encoded_t next; mi_encoded_t next;
} mi_block_t; } mi_block_t;
#if MI_GUARDED
// we always align guarded pointers in a block at an offset
// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones
#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0))
#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED)
#endif
// The `in_full` and `has_aligned` page flags are put in the bottom bits of the thread_id (for fast test in `mi_free`)
// `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing)
// `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing)
#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01)
#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02)
#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04)
#define MI_PAGE_FLAG_MASK MI_ZU(0x07)
typedef size_t mi_page_flags_t;
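Because the flags now live in the bottom bits of `xthread_id`, the free fast-path can test "owned by this thread and no flags set" with a single compare. A sketch (hypothetical helper; the real check lives in `free.c`, which is not shown here):
static inline bool example_is_fast_local_free(mi_page_t* page, mi_threadid_t tid) {
  // `tid` always has its MI_PAGE_FLAG_MASK bits clear (asserted in `_mi_prim_thread_id`),
  // so equality only holds if the page belongs to `tid` and no flag bit is set.
  return (mi_atomic_load_relaxed(&page->xthread_id) == tid);
}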
// The delayed flags are used for efficient multi-threaded free-ing
typedef enum mi_delayed_e {
MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list
MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap
MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
MI_NEVER_DELAYED_FREE = 3 // sticky: used for abandoned pages without a owning heap; this only resets on page reclaim
} mi_delayed_t;
// The `in_full` and `has_aligned` page flags are put in a union to efficiently
// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
#if !MI_TSAN
typedef union mi_page_flags_s {
uint8_t full_aligned;
struct {
uint8_t in_full : 1;
uint8_t has_aligned : 1;
} x;
} mi_page_flags_t;
#else
// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
typedef union mi_page_flags_s {
uint32_t full_aligned;
struct {
uint8_t in_full;
uint8_t has_aligned;
} x;
} mi_page_flags_t;
#endif
// Thread free list. // Thread free list.
// We use the bottom 2 bits of the pointer for mi_delayed_t flags // Points to a list of blocks that are freed by other threads.
// The low-bit is set if the page is owned by the current thread. (`mi_page_is_owned`).
// Ownership is required before we can read any non-atomic fields in the page.
// This way we can push a block on the thread free list and try to claim ownership
// atomically in `free.c:mi_free_block_mt`.
typedef uintptr_t mi_thread_free_t; typedef uintptr_t mi_thread_free_t;
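A small sketch of the pointer-plus-ownership-bit encoding described above (hypothetical helpers; blocks are at least pointer-aligned so bit 0 is free to use):
static inline mi_block_t*      example_tf_block(mi_thread_free_t tf)    { return (mi_block_t*)(tf & ~(uintptr_t)1); }
static inline bool             example_tf_is_owned(mi_thread_free_t tf) { return ((tf & 1) != 0); }
static inline mi_thread_free_t example_tf_make(mi_block_t* block, bool owned) {
  return ((mi_thread_free_t)block | (owned ? 1 : 0));
}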
// A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython)
typedef uint8_t mi_heaptag_t;
// A page contains blocks of one specific size (`block_size`). // A page contains blocks of one specific size (`block_size`).
// Each page has three lists of free blocks: // Each page has three lists of free blocks:
// `free` for blocks that can be allocated, // `free` for blocks that can be allocated,
@ -291,160 +279,93 @@ typedef uintptr_t mi_thread_free_t;
// the number of memory accesses in the `mi_page_all_free` function(s). // the number of memory accesses in the `mi_page_all_free` function(s).
// //
// Notes: // Notes:
// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Non-atomic fields can only be accessed if the thread has ownership (low bit of `xthread_free`).
// - If a page is not part of a heap it is called "abandoned" (`heap==NULL`) -- in
// that case the `xthread_id` is 0 or 1 (1 is for abandoned pages that
// are in the abandoned page lists of an arena, these are called "mapped" abandoned pages).
// - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
// - Using `uint16_t` does not seem to slow things down // - Using `uint16_t` does not seem to slow things down
// - The size is 10 words on 64-bit which helps the page index calculations
// (and 12 words on 32-bit, and encoded free lists add 2 words)
// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize
// concurrent frees where only the first concurrent free adds to the owning
// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`).
// The invariant is that no-delayed-free is only set if there is
// at least one block that will be added, or as already been added, to
// the owning heap `thread_delayed_free` list. This guarantees that pages
// will be freed correctly even if only other threads free blocks.
typedef struct mi_page_s { typedef struct mi_page_s {
// "owned" by the segment _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= heap->thread_id, or 0 or 1 if abandoned)
uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]`
uint8_t segment_in_use:1; // `true` if the segment allocated this page
uint8_t is_committed:1; // `true` if the page virtual memory is committed
uint8_t is_zero_init:1; // `true` if the page was initially zero initialized
uint8_t is_huge:1; // `true` if the page is in a huge segment
// layout like this to optimize access in `mi_malloc` and `mi_free` mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` uint16_t used; // number of blocks in use (including blocks in `thread_free`)
uint16_t reserved; // number of blocks reserved in memory uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation)
mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) uint16_t reserved; // number of blocks reserved in memory
uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
uint8_t retire_expire:7; // expiration count for retired blocks uint8_t retire_expire; // expiration count for retired blocks
mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
uint16_t used; // number of blocks in use (including blocks in `thread_free`)
uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type
// padding
size_t block_size; // size available in each block (always `>0`)
uint8_t* page_start; // start of the page area containing the blocks
size_t block_size; // size available in each block (always `>0`)
uint8_t* page_start; // start of the blocks
mi_heaptag_t heap_tag; // tag of the owning heap, used to separate heaps by object type
bool free_is_zero; // `true` if the blocks in the free list are zero initialized
// padding
#if (MI_ENCODE_FREELIST || MI_PADDING) #if (MI_ENCODE_FREELIST || MI_PADDING)
uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
#endif #endif
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages)
_Atomic(uintptr_t) xheap; struct mi_page_s* next; // next page owned by the heap with the same `block_size`
struct mi_page_s* prev; // previous page owned by the heap with the same `block_size`
struct mi_page_s* next; // next page owned by the heap with the same `block_size` size_t slice_committed; // committed size relative to the first arena slice of the page data (or 0 if the page is fully committed already)
struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` mi_memid_t memid; // provenance of the page memory
#if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit
void* padding[1];
#endif
} mi_page_t; } mi_page_t;
// ------------------------------------------------------
// Object sizes
// ------------------------------------------------------
#define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map.
#define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b)
#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks (such that we guarantee natural alignment)
#define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation
#if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8
#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*32) // 160 >= sizeof(mi_page_t)
#else
#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*32) // 128/96 >= sizeof(mi_page_t)
#endif
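An illustrative compile-time check for the constant above (not in the diff): the page meta-data must fit in the reserved page-info prefix so the block area can start at a fixed offset.
_Static_assert(sizeof(mi_page_t) <= MI_PAGE_INFO_SIZE, "mi_page_t must fit in MI_PAGE_INFO_SIZE");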
// The max object sizes are checked to not waste more than 12.5% internally over the page sizes.
// (Except for large pages since huge objects are allocated in 4MiB chunks)
#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB
#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB
#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
#define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE)
#if (MI_LARGE_MAX_OBJ_WSIZE >= 655360)
#error "mimalloc internal: define more bins"
#endif
// ------------------------------------------------------ // ------------------------------------------------------
// Mimalloc segments contain mimalloc pages // Page kinds
// ------------------------------------------------------ // ------------------------------------------------------
typedef enum mi_page_kind_e { typedef enum mi_page_kind_e {
MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment MI_PAGE_SMALL, // small blocks go into 64KiB pages
MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages
MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment MI_PAGE_LARGE, // larger blocks go into 4MiB pages
MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned) MI_PAGE_SINGLETON // page containing a single block.
// used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an alignment `> MI_BLOCK_ALIGNMENT_MAX`. // used for blocks `> MI_LARGE_MAX_OBJ_SIZE` or an alignment `> MI_PAGE_MAX_OVERALLOC_ALIGN`.
} mi_page_kind_t; } mi_page_kind_t;
// ---------------------------------------------------------------
// a memory id tracks the provenance of arena/OS allocated memory
// ---------------------------------------------------------------
// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this.
typedef enum mi_memkind_e {
MI_MEM_NONE, // not allocated
MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example)
MI_MEM_OS, // allocated from the OS
MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`)
MI_MEM_ARENA // allocated from an arena (the usual case)
} mi_memkind_t;
static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
}
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t size; // full allocation size
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
size_t block_index; // index in the arena
mi_arena_id_t id; // arena id (>= 1)
bool is_exclusive; // this arena can only be used for specific arena allocations
} mi_memid_arena_info_t;
typedef struct mi_memid_s {
union {
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
} mem;
bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages)
bool initially_committed;// `true` if the memory was originally allocated as committed
bool initially_zero; // `true` if the memory was originally zero initialized
mi_memkind_t memkind;
} mi_memid_t;
// ---------------------------------------------------------------
// Segments contain mimalloc pages
// ---------------------------------------------------------------
typedef struct mi_subproc_s mi_subproc_t;
// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS.
// Inside segments we allocated fixed size _pages_ that contain blocks.
typedef struct mi_segment_s {
// constant fields
mi_memid_t memid; // memory id to track provenance
bool allow_decommit;
bool allow_purge;
size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE`
mi_subproc_t* subproc; // segment belongs to sub process
// segment fields
struct mi_segment_s* next; // must be the first (non-constant) segment field -- see `segment.c:segment_init`
struct mi_segment_s* prev;
bool was_reclaimed; // true if it was reclaimed (used to limit reclaim-on-free reclamation)
bool dont_free; // can be temporarily true to ensure the segment is not freed
size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it is too long)
size_t used; // count of pages in use (`used <= capacity`)
size_t capacity; // count of available pages (`#free + used`)
size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages.
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled
struct mi_segment_s* abandoned_os_prev;
// layout like this to optimize access in `mi_free`
_Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment
size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge
mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
} mi_segment_t;
// ------------------------------------------------------ // ------------------------------------------------------
// Heaps // Heaps
//
// Provide first-class heaps to allocate from. // Provide first-class heaps to allocate from.
// A heap just owns a set of pages for allocation and // A heap just owns a set of pages for allocation and
// can only be used to allocate/reallocate from the thread that created it. // can only be used to allocate/reallocate from the thread that created it.
// Freeing blocks can be done from any thread though. // Freeing blocks can be done from any thread though.
// Per thread, the segments are shared among its heaps. //
// Per thread, there is always a default heap that is // Per thread, there is always a default heap that is
// used for allocation; it is initialized to statically // used for allocation; it is initialized to statically
// point to an empty heap to avoid initialization checks // point to an empty heap to avoid initialization checks
@ -461,8 +382,6 @@ typedef struct mi_page_queue_s {
size_t block_size; size_t block_size;
} mi_page_queue_t; } mi_page_queue_t;
#define MI_BIN_FULL (MI_BIN_HUGE+1)
// Random context // Random context
typedef struct mi_random_cxt_s { typedef struct mi_random_cxt_s {
uint32_t input[16]; uint32_t input[16];
@ -473,7 +392,7 @@ typedef struct mi_random_cxt_s {
// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows // In debug mode there is a padding structure at the end of the blocks to check for buffer overflows
#if (MI_PADDING) #if MI_PADDING
typedef struct mi_padding_s { typedef struct mi_padding_s {
uint32_t canary; // encoded block value to check validity of the padding (in case of overflow) uint32_t canary; // encoded block value to check validity of the padding (in case of overflow)
uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes) uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes)
@ -490,18 +409,18 @@ typedef struct mi_padding_s {
// A heap owns a set of pages. // A heap owns a set of pages.
struct mi_heap_s { struct mi_heap_s {
mi_tld_t* tld; mi_tld_t* tld; // thread-local data
_Atomic(mi_block_t*) thread_delayed_free; mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL)
mi_threadid_t thread_id; // thread this heap belongs too
mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0)
uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`)
uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list
mi_random_ctx_t random; // random number context used for secure allocation mi_random_ctx_t random; // random number context used for secure allocation
size_t page_count; // total number of pages in the `pages` queues. size_t page_count; // total number of pages in the `pages` queues.
size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues)
size_t page_retired_max; // largest retired index into the `pages` array. size_t page_retired_max; // largest retired index into the `pages` array.
size_t generic_count; // how often is mimalloc_generic invoked?
mi_heap_t* next; // list of heaps per thread mi_heap_t* next; // list of heaps per thread
bool no_reclaim; // `true` if this heap should not reclaim abandoned pages long full_page_retain; // how many full pages can be retained per queue (before abandoning them)
bool allow_page_reclaim; // `true` if this heap can reclaim abandoned pages
bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint
uint8_t tag; // custom tag, can be used for separating heaps based on the object types uint8_t tag; // custom tag, can be used for separating heaps based on the object types
#if MI_GUARDED #if MI_GUARDED
size_t guarded_size_min; // minimal size for guarded objects size_t guarded_size_min; // minimal size for guarded objects
@ -511,45 +430,11 @@ struct mi_heap_s {
size_t guarded_sample_count; // current sample count (counting down to 0) size_t guarded_sample_count; // current sample count (counting down to 0)
#endif #endif
mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") mi_page_queue_t pages[MI_BIN_COUNT]; // queue of pages for each size class (or "bin")
mi_memid_t memid; // provenance of the heap struct itself (meta or os)
}; };
// ------------------------------------------------------
// Debug
// ------------------------------------------------------
#if !defined(MI_DEBUG_UNINIT)
#define MI_DEBUG_UNINIT (0xD0)
#endif
#if !defined(MI_DEBUG_FREED)
#define MI_DEBUG_FREED (0xDF)
#endif
#if !defined(MI_DEBUG_PADDING)
#define MI_DEBUG_PADDING (0xDE)
#endif
#if (MI_DEBUG)
// use our own assertion to print without memory allocation
void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
#else
#define mi_assert(x)
#endif
#if (MI_DEBUG>1)
#define mi_assert_internal mi_assert
#else
#define mi_assert_internal(x)
#endif
#if (MI_DEBUG>2)
#define mi_assert_expensive mi_assert
#else
#define mi_assert_expensive(x)
#endif
// ------------------------------------------------------ // ------------------------------------------------------
// Statistics // Statistics
// ------------------------------------------------------ // ------------------------------------------------------
@ -575,82 +460,118 @@ typedef struct mi_stat_counter_s {
} mi_stat_counter_t; } mi_stat_counter_t;
typedef struct mi_stats_s { typedef struct mi_stats_s {
mi_stat_count_t segments; mi_stat_count_t pages;
mi_stat_count_t pages; mi_stat_count_t reserved;
mi_stat_count_t reserved; mi_stat_count_t committed;
mi_stat_count_t committed; mi_stat_count_t reset;
mi_stat_count_t reset; mi_stat_count_t purged;
mi_stat_count_t purged; mi_stat_count_t page_committed;
mi_stat_count_t page_committed; mi_stat_count_t pages_abandoned;
mi_stat_count_t segments_abandoned; mi_stat_count_t threads;
mi_stat_count_t pages_abandoned; mi_stat_count_t normal;
mi_stat_count_t threads; mi_stat_count_t huge;
mi_stat_count_t normal; mi_stat_count_t giant;
mi_stat_count_t huge; mi_stat_count_t malloc;
mi_stat_count_t giant;
mi_stat_count_t malloc;
mi_stat_count_t segments_cache;
mi_stat_counter_t pages_extended; mi_stat_counter_t pages_extended;
mi_stat_counter_t pages_reclaim_on_alloc;
mi_stat_counter_t pages_reclaim_on_free;
mi_stat_counter_t pages_reabandon_full;
mi_stat_counter_t pages_unabandon_busy_wait;
mi_stat_counter_t mmap_calls; mi_stat_counter_t mmap_calls;
mi_stat_counter_t commit_calls; mi_stat_counter_t commit_calls;
mi_stat_counter_t reset_calls; mi_stat_counter_t reset_calls;
mi_stat_counter_t purge_calls; mi_stat_counter_t purge_calls;
mi_stat_counter_t arena_purges;
mi_stat_counter_t page_no_retire; mi_stat_counter_t page_no_retire;
mi_stat_counter_t searches; mi_stat_counter_t searches;
mi_stat_counter_t normal_count; mi_stat_counter_t normal_count;
mi_stat_counter_t huge_count; mi_stat_counter_t huge_count;
mi_stat_counter_t arena_count; mi_stat_counter_t arena_count;
mi_stat_counter_t arena_crossover_count;
mi_stat_counter_t arena_rollback_count;
mi_stat_counter_t guarded_alloc_count; mi_stat_counter_t guarded_alloc_count;
#if MI_STAT>1 #if MI_STAT>1
mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; mi_stat_count_t normal_bins[MI_BIN_COUNT];
#endif #endif
} mi_stats_t; } mi_stats_t;
// add to stat keeping track of the peak // add to stat keeping track of the peak
-void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
-void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_increase(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount);
// adjust stat in special cases to compensate for double counting
-void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount);
-void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc);
+void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free);
+void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc);
+void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_free);
// counters can just be increased
-void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
+void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
+void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount);
#if (MI_STAT)
-#define mi_stat_increase(stat,amount)                   _mi_stat_increase( &(stat), amount)
-#define mi_stat_decrease(stat,amount)                   _mi_stat_decrease( &(stat), amount)
-#define mi_stat_counter_increase(stat,amount)           _mi_stat_counter_increase( &(stat), amount)
-#define mi_stat_adjust_increase(stat,amount)            _mi_stat_adjust_increase( &(stat), amount)
-#define mi_stat_adjust_decrease(stat,amount)            _mi_stat_adjust_decrease( &(stat), amount)
+#define mi_debug_stat_increase(stat,amount)             __mi_stat_increase( &(stat), amount)
+#define mi_debug_stat_decrease(stat,amount)             __mi_stat_decrease( &(stat), amount)
+#define mi_debug_stat_counter_increase(stat,amount)     __mi_stat_counter_increase( &(stat), amount)
+#define mi_debug_stat_increase_mt(stat,amount)          __mi_stat_increase_mt( &(stat), amount)
+#define mi_debug_stat_decrease_mt(stat,amount)          __mi_stat_decrease_mt( &(stat), amount)
+#define mi_debug_stat_counter_increase_mt(stat,amount)  __mi_stat_counter_increase_mt( &(stat), amount)
+#define mi_debug_stat_adjust_increase_mt(stat,amnt,b)   __mi_stat_adjust_increase_mt( &(stat), amnt, b)
+#define mi_debug_stat_adjust_decrease_mt(stat,amnt,b)   __mi_stat_adjust_decrease_mt( &(stat), amnt, b)
#else
-#define mi_stat_increase(stat,amount)                   ((void)0)
-#define mi_stat_decrease(stat,amount)                   ((void)0)
-#define mi_stat_counter_increase(stat,amount)           ((void)0)
-#define mi_stat_adjuct_increase(stat,amount)            ((void)0)
-#define mi_stat_adjust_decrease(stat,amount)            ((void)0)
+#define mi_debug_stat_increase(stat,amount)             ((void)0)
+#define mi_debug_stat_decrease(stat,amount)             ((void)0)
+#define mi_debug_stat_counter_increase(stat,amount)     ((void)0)
+#define mi_debug_stat_increase_mt(stat,amount)          ((void)0)
+#define mi_debug_stat_decrease_mt(stat,amount)          ((void)0)
+#define mi_debug_stat_counter_increase_mt(stat,amount)  ((void)0)
+#define mi_debug_stat_adjust_increase(stat,amnt,b)      ((void)0)
+#define mi_debug_stat_adjust_decrease(stat,amnt,b)      ((void)0)
#endif
-#define mi_heap_stat_counter_increase(heap,stat,amount)  mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
-#define mi_heap_stat_increase(heap,stat,amount)          mi_stat_increase( (heap)->tld->stats.stat, amount)
-#define mi_heap_stat_decrease(heap,stat,amount)          mi_stat_decrease( (heap)->tld->stats.stat, amount)
+#define mi_subproc_stat_counter_increase(subproc,stat,amount)  __mi_stat_counter_increase_mt( &(subproc)->stats.stat, amount)
+#define mi_subproc_stat_increase(subproc,stat,amount)          __mi_stat_increase_mt( &(subproc)->stats.stat, amount)
+#define mi_subproc_stat_decrease(subproc,stat,amount)          __mi_stat_decrease_mt( &(subproc)->stats.stat, amount)
+#define mi_subproc_stat_adjust_increase(subproc,stat,amnt,b)   __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amnt, b)
+#define mi_subproc_stat_adjust_decrease(subproc,stat,amnt,b)   __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amnt, b)
+#define mi_os_stat_counter_increase(stat,amount)  mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount)
+#define mi_os_stat_increase(stat,amount)          mi_subproc_stat_increase(_mi_subproc(),stat,amount)
+#define mi_os_stat_decrease(stat,amount)          mi_subproc_stat_decrease(_mi_subproc(),stat,amount)
+#define mi_heap_stat_counter_increase(heap,stat,amount)  __mi_stat_counter_increase( &(heap)->tld->stats.stat, amount)
+#define mi_heap_stat_increase(heap,stat,amount)          __mi_stat_increase( &(heap)->tld->stats.stat, amount)
+#define mi_heap_stat_decrease(heap,stat,amount)          __mi_stat_decrease( &(heap)->tld->stats.stat, amount)
+#define mi_debug_heap_stat_counter_increase(heap,stat,amount)  mi_debug_stat_counter_increase( (heap)->tld->stats.stat, amount)
+#define mi_debug_heap_stat_increase(heap,stat,amount)          mi_debug_stat_increase( (heap)->tld->stats.stat, amount)
+#define mi_debug_heap_stat_decrease(heap,stat,amount)          mi_debug_stat_decrease( (heap)->tld->stats.stat, amount)
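For orientation, a hedged illustration (not part of the change itself) of the naming convention above: the `mi_heap_stat_*` macros update the thread-local heap statistics non-atomically, the `mi_subproc_stat_*` macros use the atomic `_mt` variants on the shared sub-process statistics, and the `mi_debug_stat_*` forms compile away to `((void)0)` unless `MI_STAT` is enabled. The stat field names `normal` and `normal_count` are taken from usages elsewhere in this change set.

// hedged illustration of the stat macro convention (internal headers assumed)
static void example_track_alloc(mi_heap_t* heap, size_t bsize) {
  mi_heap_stat_increase(heap, normal, bsize);                  // thread-local heap stats, always compiled in
  mi_debug_heap_stat_counter_increase(heap, normal_count, 1);  // becomes ((void)0) unless MI_STAT is enabled
  mi_subproc_stat_increase(_mi_subproc(), normal, bsize);      // atomic `_mt` update on the shared sub-process stats
}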
// ------------------------------------------------------
-// Sub processes do not reclaim or visit segments
-// from other sub processes
+// Sub processes use separate arena's and no heaps/pages/blocks
+// are shared between sub processes.
+// The subprocess structure contains essentially all static variables (except per subprocess :-))
+//
+// Each thread should belong to one sub-process only
// ------------------------------------------------------

-struct mi_subproc_s {
-  _Atomic(size_t) abandoned_count;         // count of abandoned segments for this sub-process
-  _Atomic(size_t) abandoned_os_list_count; // count of abandoned segments in the os-list
-  mi_lock_t     abandoned_os_lock;         // lock for the abandoned os segment list (outside of arena's) (this lock protect list operations)
-  mi_lock_t     abandoned_os_visit_lock;   // ensure only one thread per subproc visits the abandoned os list
-  mi_segment_t* abandoned_os_list;         // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory)
-  mi_segment_t* abandoned_os_list_tail;    // the tail-end of the list
-  mi_memid_t    memid;                     // provenance of this memory block
-};
+#define MI_MAX_ARENAS  (160)  // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`)
+                              // 160 arenas is enough for ~2 TiB memory
+
+typedef struct mi_subproc_s {
+  _Atomic(size_t)      arena_count;                    // current count of arena's
+  _Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS];          // arena's of this sub-process
+  mi_lock_t            arena_reserve_lock;             // lock to ensure arena's get reserved one at a time
+  _Atomic(int64_t)     purge_expire;                   // expiration is set if any arenas can be purged
+
+  _Atomic(size_t)      abandoned_count[MI_BIN_COUNT];  // total count of abandoned pages for this sub-process
+  mi_page_t*           os_abandoned_pages;             // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on)
+  mi_lock_t            os_abandoned_pages_lock;        // lock for the os abandoned pages list (this lock protects list operations)
+
+  mi_memid_t           memid;                          // provenance of this memory block (meta or OS)
+  mi_stats_t           stats;                          // sub-process statistics (tld stats are merged in on thread termination)
+} mi_subproc_t;
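Sub-processes are also exposed through the public `mi_subproc_*` API in `mimalloc.h`; a minimal hedged sketch of isolating a worker thread in its own sub-process (assuming the `mi_subproc_new`/`mi_subproc_add_current_thread`/`mi_subproc_delete` entry points) could look like:

// hedged sketch: give one worker thread its own sub-process (own arenas, own stats)
#include <mimalloc.h>
#include <pthread.h>

static void* worker(void* arg) {
  mi_subproc_id_t sub = *(mi_subproc_id_t*)arg;
  mi_subproc_add_current_thread(sub);   // call before any allocation in this thread
  void* p = mi_malloc(128);             // served from arenas owned by `sub`, never shared with other sub-processes
  mi_free(p);
  return NULL;
}

int main(void) {
  mi_subproc_id_t sub = mi_subproc_new();   // fresh sub-process
  pthread_t t;
  pthread_create(&t, NULL, &worker, &sub);
  pthread_join(t, NULL);
  mi_subproc_delete(sub);                   // only valid once no thread uses it anymore
  return 0;
}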
// ------------------------------------------------------
// Thread Local data
@@ -659,34 +580,57 @@ struct mi_subproc_s {
// Milliseconds as in `int64_t` to avoid overflows
typedef int64_t  mi_msecs_t;

-// Queue of segments
-typedef struct mi_segment_queue_s {
-  mi_segment_t* first;
-  mi_segment_t* last;
-} mi_segment_queue_t;
-
-// Segments thread local data
-typedef struct mi_segments_tld_s {
-  mi_segment_queue_t  small_free;    // queue of segments with free small pages
-  mi_segment_queue_t  medium_free;   // queue of segments with free medium pages
-  mi_page_queue_t     pages_purge;   // queue of freed pages that are delay purged
-  size_t              count;         // current number of segments;
-  size_t              peak_count;    // peak number of segments
-  size_t              current_size;  // current size of all segments
-  size_t              peak_size;     // peak size of all segments
-  size_t              reclaim_count; // number of reclaimed (abandoned) segments
-  mi_subproc_t*       subproc;       // sub-process this thread belongs to.
-  mi_stats_t*         stats;         // points to tld stats
-} mi_segments_tld_t;
-
// Thread local data
struct mi_tld_s {
-  unsigned long long  heartbeat;     // monotonic heartbeat count
-  bool                recurse;       // true if deferred was called; used to prevent infinite recursion.
-  mi_heap_t*          heap_backing;  // backing heap of this thread (cannot be deleted)
-  mi_heap_t*          heaps;         // list of heaps in this thread (so we can abandon all when the thread terminates)
-  mi_segments_tld_t   segments;      // segment tld
-  mi_stats_t          stats;         // statistics
+  mi_threadid_t       thread_id;        // thread id of this thread
+  size_t              thread_seq;       // thread sequence id (linear count of created threads)
+  mi_subproc_t*       subproc;          // sub-process this thread belongs to.
+  mi_heap_t*          heap_backing;     // backing heap of this thread (cannot be deleted)
+  mi_heap_t*          heaps;            // list of heaps in this thread (so we can abandon all when the thread terminates)
+  unsigned long long  heartbeat;        // monotonic heartbeat count
+  bool                recurse;          // true if deferred was called; used to prevent infinite recursion.
+  bool                is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks)
+  mi_stats_t          stats;            // statistics
+  mi_memid_t          memid;            // provenance of the tld memory itself (meta or OS)
};
/* -----------------------------------------------------------
Error codes passed to `_mi_fatal_error`
All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
For portability define undefined error codes using common Unix codes:
<https://www-numi.fnal.gov/offline_software/srt_public_context/WebDocs/Errors/unix_system_errors.html>
----------------------------------------------------------- */
#ifndef EAGAIN // double free
#define EAGAIN (11)
#endif
#ifndef ENOMEM // out of memory
#define ENOMEM (12)
#endif
#ifndef EFAULT // corrupted free-list or meta-data
#define EFAULT (14)
#endif
#ifndef EINVAL // trying to free an invalid pointer
#define EINVAL (22)
#endif
#ifndef EOVERFLOW // count*size overflow
#define EOVERFLOW (75)
#endif
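These codes reach the application through the public error callback; a hedged usage sketch using the long-standing `mi_register_error` API from `mimalloc.h`:

// hedged sketch: map the error codes above to readable diagnostics
#include <mimalloc.h>
#include <errno.h>
#include <stdio.h>

static void my_error_handler(int err, void* arg) {
  (void)arg;
  switch (err) {
    case EAGAIN:    fprintf(stderr, "mimalloc: double free detected\n"); break;
    case ENOMEM:    fprintf(stderr, "mimalloc: out of memory\n"); break;
    case EFAULT:    fprintf(stderr, "mimalloc: corrupted free-list or meta-data\n"); break;
    case EINVAL:    fprintf(stderr, "mimalloc: trying to free an invalid pointer\n"); break;
    case EOVERFLOW: fprintf(stderr, "mimalloc: count*size overflow\n"); break;
    default:        fprintf(stderr, "mimalloc: error %d\n", err); break;
  }
}

int main(void) {
  mi_register_error(&my_error_handler, NULL);
  // ... run the application; the handler is invoked on the conditions above.
  return 0;
}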
// ------------------------------------------------------
// Debug
// ------------------------------------------------------
#ifndef MI_DEBUG_UNINIT
#define MI_DEBUG_UNINIT (0xD0)
#endif
#ifndef MI_DEBUG_FREED
#define MI_DEBUG_FREED (0xDF)
#endif
#ifndef MI_DEBUG_PADDING
#define MI_DEBUG_PADDING (0xDE)
#endif
#endif // MI_TYPES_H

@@ -16,21 +16,22 @@ terms of the MIT license. A copy of the license can be found in the file
// ------------------------------------------------------
static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) {
-  // objects up to `MI_MAX_ALIGN_GUARANTEE` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`).
+  // objects up to `MI_PAGE_MIN_BLOCK_ALIGN` are always allocated aligned to their size
  mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0));
  if (alignment > size) return false;
-  if (alignment <= MI_MAX_ALIGN_SIZE) return true;
  const size_t bsize = mi_good_size(size);
-  return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0);
+  const bool ok = (bsize <= MI_PAGE_MAX_START_BLOCK_ALIGN2 && _mi_is_power_of_two(bsize));
+  if (ok) { mi_assert_internal((bsize & (alignment-1)) == 0); }  // since both power of 2 and alignment <= size
+  return ok;
}
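A hedged sketch (using only the public API, not code from this diff) of the fast path this predicate enables: blocks that round up to a small power-of-two size class come out of the regular allocation path already aligned to their size, so no over-allocation is needed.

// hedged sketch: natural alignment of small power-of-two size classes
#include <mimalloc.h>
#include <assert.h>
#include <stdint.h>

int main(void) {
  // 256 rounds to a power-of-two size class, so the block from the regular
  // allocation path is already 256-aligned and no padding is required.
  void* p = mi_malloc_aligned(256, 256);
  assert(((uintptr_t)p & (uintptr_t)(256 - 1)) == 0);
  mi_free(p);
  return 0;
}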
#if MI_GUARDED
static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, size_t size, size_t alignment, bool zero) mi_attr_noexcept {
  // use over-allocation for guarded blocks
-  mi_assert_internal(alignment > 0 && alignment < MI_BLOCK_ALIGNMENT_MAX);
+  mi_assert_internal(alignment > 0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN);
  const size_t oversize = size + alignment - 1;
  void* base = _mi_heap_malloc_guarded(heap, oversize, zero);
-  void* p = mi_align_up_ptr(base, alignment);
+  void* p = _mi_align_up_ptr(base, alignment);
  mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size);
  mi_assert_internal(mi_usable_size(p) >= size);
  mi_assert_internal(_mi_is_aligned(p, alignment));
@@ -59,21 +60,20 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
  void* p;
  size_t oversize;
-  if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) {
-    // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page)
-    // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the
-    // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down)
+  if mi_unlikely(alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
+    // use OS allocation for large alignments and allocate inside a singleton page (not in an arena)
+    // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned
+    // in the first (and single) page such that the page info is `MI_PAGE_ALIGN` bytes before it (and can be found in the _mi_page_map).
    if mi_unlikely(offset != 0) {
      // todo: cannot support offset alignment for very large alignments yet
      #if MI_DEBUG > 0
-      _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
+      _mi_error_message(EOVERFLOW, "aligned allocation with a large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
      #endif
      return NULL;
    }
    oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
    // note: no guarded as alignment > 0
-    p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment);  // the page block size should be large enough to align in the single huge page block
-    // zero afterwards as only the area from the aligned_p may be committed!
+    p = _mi_heap_malloc_zero_ex(heap, oversize, zero, alignment);   // the page block size should be large enough to align in the single huge page block
    if (p == NULL) return NULL;
  }
  else {
@@ -114,13 +114,13 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
  #endif
  // now zero the block if needed
-  if (alignment > MI_BLOCK_ALIGNMENT_MAX) {
-    // for the tracker, on huge aligned allocations only from the start of the large block is defined
-    mi_track_mem_undefined(aligned_p, size);
-    if (zero) {
-      _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
-    }
-  }
+  //if (alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
+  //  // for the tracker, on huge aligned allocations only from the start of the large block is defined
+  //  mi_track_mem_undefined(aligned_p, size);
+  //  if (zero) {
+  //    _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
+  //  }
+  //}
  if (p != aligned_p) {
    mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p));
@@ -177,12 +177,14 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
  }
  #if MI_GUARDED
-  if (offset==0 && alignment < MI_BLOCK_ALIGNMENT_MAX && mi_heap_malloc_use_guarded(heap,size)) {
+  if (offset==0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN && mi_heap_malloc_use_guarded(heap,size)) {
    return mi_heap_malloc_guarded_aligned(heap, size, alignment, zero);
  }
  #endif
  // try first if there happens to be a small block available with just the right alignment
+  // since most small power-of-2 blocks (under MI_PAGE_MAX_BLOCK_START_ALIGN2) are already
+  // naturally aligned this can often be the case.
  if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) {
    const uintptr_t align_mask = alignment-1;  // for any x, `(x & align_mask) == (x % alignment)`
    const size_t padsize = size + MI_PADDING_SIZE;
@@ -191,9 +193,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
    const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0;
    if mi_likely(is_aligned)
    {
-      #if MI_STAT>1
-      mi_heap_stat_increase(heap, malloc, size);
-      #endif
+      mi_debug_heap_stat_increase(heap, malloc, size);
      void* p = (zero ? _mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize));  // call specific page malloc for better codegen
      mi_assert_internal(p != NULL);
      mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
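For reference, the offset-aligned entry point that ultimately reaches this code path; a hedged usage sketch with the public `mi_malloc_aligned_at` API (the header layout is only illustrative):

// hedged sketch: align the payload that sits behind a fixed-size header
#include <mimalloc.h>
#include <assert.h>
#include <stdint.h>

typedef struct { uint32_t tag; uint8_t pad[12]; } header_t;  // 16-byte header (illustrative)

int main(void) {
  // request that (ptr + 16) is 64-byte aligned, i.e. the payload after the header
  void* p = mi_malloc_aligned_at(sizeof(header_t) + 1024, 64, sizeof(header_t));
  assert((((uintptr_t)p + sizeof(header_t)) & 63) == 0);
  mi_free(p);
  return 0;
}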

@@ -30,7 +30,11 @@ terms of the MIT license. A copy of the license can be found in the file
// Note: in release mode the (inlined) routine is about 7 instructions with a single test.
extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept
{
-  mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size);
+  if (page->block_size != 0) { // not the empty heap
+    mi_assert_internal(mi_page_block_size(page) >= size);
+    mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
+    mi_assert_internal(_mi_ptr_page(page)==page);
+  }
  // check the free list
  mi_block_t* const block = page->free;
@@ -82,7 +86,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_
  #if (MI_STAT>0)
  const size_t bsize = mi_page_usable_block_size(page);
-  if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+  if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
    mi_heap_stat_increase(heap, normal, bsize);
    mi_heap_stat_counter_increase(heap, normal_count, 1);
    #if (MI_STAT>1)
@@ -130,7 +134,7 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap,
  mi_assert(size <= MI_SMALL_SIZE_MAX);
  #if MI_DEBUG
  const uintptr_t tid = _mi_thread_id();
-  mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local
+  mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == tid); // heaps are thread local
  #endif
  #if (MI_PADDING || MI_GUARDED)
  if (size == 0) { size = sizeof(void*); }
@@ -184,7 +188,7 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z
  else {
    // regular allocation
    mi_assert(heap!=NULL);
-    mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
+    mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == _mi_thread_id()); // heaps are thread local
    void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment);  // note: size can overflow but it is detected in malloc_generic
    mi_track_malloc(p,size,zero);
@@ -268,7 +272,7 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero)
  // if p == NULL then behave as malloc.
  // else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)).
  // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.)
-  const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0)
+  const size_t size = (p==NULL ? 0 : _mi_usable_size(p,"mi_realloc"));
  if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) {  // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0)
    mi_assert_internal(p!=NULL);
    // todo: do not track as the usable size is still the same in the free; adjust potential padding?
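The comments above pin down the realloc contract; a hedged sketch of the calling pattern they imply (since `mi_realloc` returns NULL only on error and then leaves the original block untouched):

// hedged sketch: grow a buffer without leaking it on failure
#include <mimalloc.h>

static int grow_buffer(void** pbuf, size_t newsize) {
  void* q = mi_realloc(*pbuf, newsize);  // newsize==0 still returns a valid zero-sized block
  if (q == NULL) return 0;               // error: *pbuf is still valid and must still be freed
  *pbuf = q;
  return 1;
}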
@@ -615,7 +619,6 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
  block->next = MI_BLOCK_TAG_GUARDED;
  // set guard page at the end of the block
-  mi_segment_t* const segment = _mi_page_segment(page);
  const size_t block_size = mi_page_block_size(page);  // must use `block_size` to match `mi_free_local`
  const size_t os_page_size = _mi_os_page_size();
  mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t));
@@ -625,8 +628,11 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
    return NULL;
  }
  uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size;
+  // note: the alignment of the guard page relies on blocks being os_page_size aligned which
+  // is ensured in `mi_arena_page_alloc_fresh`.
+  mi_assert_internal(_mi_is_aligned(block, os_page_size));
  mi_assert_internal(_mi_is_aligned(guard_page, os_page_size));
-  if (segment->allow_decommit && _mi_is_aligned(guard_page, os_page_size)) {
+  if (!page->memid.is_pinned && _mi_is_aligned(guard_page, os_page_size)) {
    _mi_os_protect(guard_page, os_page_size);
  }
  else {
@@ -636,9 +642,9 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
  // align pointer just in front of the guard page
  size_t offset = block_size - os_page_size - obj_size;
  mi_assert_internal(offset > sizeof(mi_block_t));
-  if (offset > MI_BLOCK_ALIGNMENT_MAX) {
+  if (offset > MI_PAGE_MAX_OVERALLOC_ALIGN) {
    // give up to place it right in front of the guard page if the offset is too large for unalignment
-    offset = MI_BLOCK_ALIGNMENT_MAX;
+    offset = MI_PAGE_MAX_OVERALLOC_ALIGN;
  }
  void* p = (uint8_t*)block + offset;
  mi_track_align(block, p, offset, obj_size);
@@ -659,7 +665,7 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo
  const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size);
  mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(heap, req_size, zero, 0 /* huge_alignment */);
  if (block==NULL) return NULL;
  void* const p = mi_block_ptr_set_guarded(block, obj_size);
  // stats
  mi_track_malloc(p, size, zero);
@@ -668,7 +674,7 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo
  #if MI_STAT>1
  mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
  #endif
-  _mi_stat_counter_increase(&heap->tld->stats.guarded_alloc_count, 1);
+  mi_heap_stat_counter_increase(heap, guarded_alloc_count, 1);
  }
  #if MI_DEBUG>3
  if (p != NULL && zero) {
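The guarded-object support above is normally driven by sampling options. As a hedged sketch only: the option names below are an assumption based on recent releases with `MI_GUARDED` enabled, they are not taken from this diff.

// hedged sketch: sample some allocations in a size range into guarded blocks,
// so an overrun past the object hits the protected guard page immediately
#include <mimalloc.h>

int main(void) {
  mi_option_set(mi_option_guarded_min, 16);            // assumed option name (MIMALLOC_GUARDED_MIN)
  mi_option_set(mi_option_guarded_max, 1024);           // assumed option name (MIMALLOC_GUARDED_MAX)
  mi_option_set(mi_option_guarded_sample_rate, 1000);   // assumed: roughly 1 in 1000 allocations gets a guard page
  // ... allocations between 16 and 1024 bytes are now sampled into guarded blocks (requires a MI_GUARDED build)
  return 0;
}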

@ -1,346 +0,0 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#if !defined(MI_IN_ARENA_C)
#error "this file should be included from 'arena.c' (so mi_arena_t is visible)"
// add includes help an IDE
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"
#endif
// Minimal exports for arena-abandoned.
size_t mi_arena_id_index(mi_arena_id_t id);
mi_arena_t* mi_arena_from_index(size_t idx);
size_t mi_arena_get_count(void);
void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex);
bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index);
/* -----------------------------------------------------------
Abandoned blocks/segments:
_mi_arena_segment_clear_abandoned
_mi_arena_segment_mark_abandoned
This is used to atomically abandon/reclaim segments
(and crosses the arena API but it is convenient to have here).
Abandoned segments still have live blocks; they get reclaimed
when a thread frees a block in it, or when a thread needs a fresh
segment.
Abandoned segments are atomically marked in the `block_abandoned`
bitmap of arenas. Any segments allocated outside arenas are put
in the sub-process `abandoned_os_list`. This list is accessed
using locks but this should be uncommon and generally uncontended.
Reclaim and visiting either scan through the `block_abandoned`
bitmaps of the arena's, or visit the `abandoned_os_list`
A potentially nicer design is to use arena's for everything
and perhaps have virtual arena's to map OS allocated memory
but this would lack the "density" of our current arena's. TBC.
----------------------------------------------------------- */
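The public face of this machinery is the visiting entry point declared further below (`mi_abandoned_visit_blocks`); a hedged usage sketch, assuming the standard `mi_block_visit_fun` callback type and `mi_subproc_main()` from `mimalloc.h` (the meaning of a negative heap tag as "any tag" is an assumption):

// hedged sketch: count blocks still live in abandoned segments of the main
// sub-process (requires MIMALLOC_VISIT_ABANDONED=ON from program start)
#include <mimalloc.h>
#include <stdio.h>

static bool count_block(const mi_heap_t* heap, const mi_heap_area_t* area,
                        void* block, size_t block_size, void* arg) {
  (void)heap; (void)area; (void)block_size;
  if (block != NULL) { (*(size_t*)arg)++; }
  return true;  // keep visiting
}

int main(void) {
  size_t live = 0;
  mi_abandoned_visit_blocks(mi_subproc_main(), -1 /* any heap tag (assumption) */,
                            true /* visit blocks */, &count_block, &live);
  printf("live blocks in abandoned segments: %zu\n", live);
  return 0;
}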
// reclaim a specific OS abandoned segment; `true` on success.
// sets the thread_id.
static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) {
mi_assert(segment->memid.memkind != MI_MEM_ARENA);
// not in an arena, remove from list of abandoned os segments
mi_subproc_t* const subproc = segment->subproc;
if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) {
return false; // failed to acquire the lock, we just give up
}
// remove atomically from the abandoned os list (if possible!)
bool reclaimed = false;
mi_segment_t* const next = segment->abandoned_os_next;
mi_segment_t* const prev = segment->abandoned_os_prev;
if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) {
#if MI_DEBUG>3
// find ourselves in the abandoned list (and check the count)
bool found = false;
size_t count = 0;
for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) {
if (current == segment) { found = true; }
count++;
}
mi_assert_internal(found);
mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count));
#endif
// remove (atomically) from the list and reclaim
if (prev != NULL) { prev->abandoned_os_next = next; }
else { subproc->abandoned_os_list = next; }
if (next != NULL) { next->abandoned_os_prev = prev; }
else { subproc->abandoned_os_list_tail = prev; }
segment->abandoned_os_next = NULL;
segment->abandoned_os_prev = NULL;
mi_atomic_decrement_relaxed(&subproc->abandoned_count);
mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count);
if (take_lock) { // don't reset the thread_id when iterating
mi_atomic_store_release(&segment->thread_id, _mi_thread_id());
}
reclaimed = true;
}
if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); }
return reclaimed;
}
// reclaim a specific abandoned segment; `true` on success.
// sets the thread_id.
bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) {
if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) {
return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */);
}
// arena segment: use the blocks_abandoned bitmap.
size_t arena_idx;
size_t bitmap_idx;
mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx);
mi_arena_t* arena = mi_arena_from_index(arena_idx);
mi_assert_internal(arena != NULL);
// reclaim atomically
bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx);
if (was_marked) {
mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0);
mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count);
mi_atomic_store_release(&segment->thread_id, _mi_thread_id());
}
// mi_assert_internal(was_marked);
mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
//mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx));
return was_marked;
}
// mark a specific OS segment as abandoned
static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) {
mi_assert(segment->memid.memkind != MI_MEM_ARENA);
// not in an arena; we use a list of abandoned segments
mi_subproc_t* const subproc = segment->subproc;
mi_lock(&subproc->abandoned_os_lock) {
// push on the tail of the list (important for the visitor)
mi_segment_t* prev = subproc->abandoned_os_list_tail;
mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL);
mi_assert_internal(segment->abandoned_os_prev == NULL);
mi_assert_internal(segment->abandoned_os_next == NULL);
if (prev != NULL) { prev->abandoned_os_next = segment; }
else { subproc->abandoned_os_list = segment; }
subproc->abandoned_os_list_tail = segment;
segment->abandoned_os_prev = prev;
segment->abandoned_os_next = NULL;
mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count);
mi_atomic_increment_relaxed(&subproc->abandoned_count);
// and release the lock
}
return;
}
// mark a specific segment as abandoned
// clears the thread_id.
void _mi_arena_segment_mark_abandoned(mi_segment_t* segment)
{
mi_assert_internal(segment->used == segment->abandoned);
mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's
if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) {
mi_arena_segment_os_mark_abandoned(segment);
return;
}
// segment is in an arena, mark it in the arena `blocks_abandoned` bitmap
size_t arena_idx;
size_t bitmap_idx;
mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx);
mi_arena_t* arena = mi_arena_from_index(arena_idx);
mi_assert_internal(arena != NULL);
// set abandonment atomically
mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned
const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL);
if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); }
mi_assert_internal(was_unmarked);
mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
}
/* -----------------------------------------------------------
Iterate through the abandoned blocks/segments using a cursor.
This is used for reclaiming and abandoned block visiting.
----------------------------------------------------------- */
// start a cursor at a randomized arena
void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) {
mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc);
current->bitmap_idx = 0;
current->subproc = subproc;
current->visit_all = visit_all;
current->hold_visit_lock = false;
const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count);
const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count);
const size_t max_arena = mi_arena_get_count();
if (heap != NULL && heap->arena_id != _mi_arena_id_none()) {
// for a heap that is bound to one arena, only visit that arena
current->start = mi_arena_id_index(heap->arena_id);
current->end = current->start + 1;
current->os_list_count = 0;
}
else {
// otherwise visit all starting at a random location
if (abandoned_count > abandoned_list_count && max_arena > 0) {
current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena));
current->end = current->start + max_arena;
}
else {
current->start = 0;
current->end = 0;
}
current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list
}
mi_assert_internal(current->start <= max_arena);
}
void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) {
if (current->hold_visit_lock) {
mi_lock_release(&current->subproc->abandoned_os_visit_lock);
current->hold_visit_lock = false;
}
}
static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) {
// try to reclaim an abandoned segment in the arena atomically
if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL;
mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx);
mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0);
// check that the segment belongs to our sub-process
// note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled.
// without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process.
// for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock.
if (segment->subproc != subproc) {
// it is from another sub-process, re-mark it and continue searching
const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL);
mi_assert_internal(was_zero); MI_UNUSED(was_zero);
return NULL;
}
else {
// success, we unabandoned a segment in our sub-process
mi_atomic_decrement_relaxed(&subproc->abandoned_count);
return segment;
}
}
static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) {
const size_t max_arena = mi_arena_get_count();
size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx);
size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx);
// visit arena's (from the previous cursor)
for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) {
// index wraps around
size_t arena_idx = (previous->start >= max_arena ? previous->start % max_arena : previous->start);
mi_arena_t* arena = mi_arena_from_index(arena_idx);
if (arena != NULL) {
bool has_lock = false;
// visit the abandoned fields (starting at previous_idx)
for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) {
size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]);
if mi_unlikely(field != 0) { // skip zero fields quickly
// we only take the arena lock if there are actually abandoned segments present
if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) {
has_lock = (previous->visit_all ? (mi_lock_acquire(&arena->abandoned_visit_lock),true) : mi_lock_try_acquire(&arena->abandoned_visit_lock));
if (!has_lock) {
if (previous->visit_all) {
_mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock");
}
// skip to next arena
break;
}
}
mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned));
// visit each set bit in the field (todo: maybe use `ctz` here?)
for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) {
// pre-check if the bit is set
size_t mask = ((size_t)1 << bit_idx);
if mi_unlikely((field & mask) == mask) {
mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx);
mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx);
if (segment != NULL) {
//mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx));
if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); }
previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration
return segment;
}
}
}
}
}
if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); }
}
}
return NULL;
}
static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) {
// go through the abandoned_os_list
// we only allow one thread per sub-process to do to visit guarded by the `abandoned_os_visit_lock`.
// The lock is released when the cursor is released.
if (!previous->hold_visit_lock) {
previous->hold_visit_lock = (previous->visit_all ? (mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock),true)
: mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock));
if (!previous->hold_visit_lock) {
if (previous->visit_all) {
_mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock");
}
return NULL; // we cannot get the lock, give up
}
}
// One list entry at a time
while (previous->os_list_count > 0) {
previous->os_list_count--;
mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free`
mi_segment_t* segment = previous->subproc->abandoned_os_list;
// pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries)
if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) {
mi_lock_release(&previous->subproc->abandoned_os_lock);
return segment;
}
// already abandoned, try again
mi_lock_release(&previous->subproc->abandoned_os_lock);
}
// done
mi_assert_internal(previous->os_list_count == 0);
return NULL;
}
// reclaim abandoned segments
// this does not set the thread id (so it appears as still abandoned)
mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) {
if (previous->start < previous->end) {
// walk the arena
mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous);
if (segment != NULL) { return segment; }
}
// no entries in the arena's anymore, walk the abandoned OS list
mi_assert_internal(previous->start == previous->end);
return mi_arena_segment_clear_abandoned_next_list(previous);
}
bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
// (unfortunately) the visit_abandoned option must be enabled from the start.
// This is to avoid taking locks if abandoned list visiting is not required (as for most programs)
if (!mi_option_is_enabled(mi_option_visit_abandoned)) {
_mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON");
return false;
}
mi_arena_field_cursor_t current;
_mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, &current);
mi_segment_t* segment;
bool ok = true;
while (ok && (segment = _mi_arena_segment_clear_abandoned_next(&current)) != NULL) {
ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg);
_mi_arena_segment_mark_abandoned(segment);
}
_mi_arena_field_cursor_done(&current);
return ok;
}

src/arena-meta.c (new file, 174 lines)
@ -0,0 +1,174 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
We have a special "mini" allocator just for allocation of meta-data like
the heap (`mi_heap_t`) or thread-local data (`mi_tld_t`).
We reuse the bitmap of the arena's for allocation of 64b blocks inside
an arena slice (64KiB).
We always ensure that meta data is zero'd (we zero on `free`)
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"
/* -----------------------------------------------------------
Meta data allocation
----------------------------------------------------------- */
#define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE
#define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN
#define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit)
#define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE
#define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024
#define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE)
typedef struct mi_meta_page_s {
_Atomic(struct mi_meta_page_s*) next; // a linked list of meta-data pages (never released)
mi_memid_t memid; // provenance of the meta-page memory itself
mi_bbitmap_t blocks_free; // a small bitmap with 1 bit per block.
} mi_meta_page_t;
static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_VAR_INIT(NULL);
#if MI_DEBUG > 1
static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) {
mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + _mi_os_secure_guard_page_size());
if (block_idx != NULL) {
*block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE;
}
return mpage;
}
#endif
static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) {
return mi_atomic_load_ptr_acquire(mi_meta_page_t, &mpage->next);
}
static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) {
mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - _mi_os_secure_guard_page_size(), MI_META_PAGE_ALIGN));
mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE);
void* p = ((uint8_t*)mpage - _mi_os_secure_guard_page_size() + (block_idx * MI_META_BLOCK_SIZE));
mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL));
return p;
}
// allocate a fresh meta page and add it to the global list.
static mi_meta_page_t* mi_meta_page_zalloc(void) {
// allocate a fresh arena slice
// note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again..
mi_memid_t memid;
uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0,
true /* commit*/, (MI_SECURE==0) /* allow large? */,
NULL /* req arena */, 0 /* thread_seq */, &memid);
if (base == NULL) return NULL;
mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN));
if (!memid.initially_zero) {
_mi_memzero_aligned(base, MI_ARENA_SLICE_SIZE);
}
// guard pages
#if MI_SECURE >= 1
_mi_os_secure_guard_page_set_at(base, memid.is_pinned);
_mi_os_secure_guard_page_set_before(base + MI_META_PAGE_SIZE, memid.is_pinned);
#endif
// initialize the page and free block bitmap
mi_meta_page_t* mpage = (mi_meta_page_t*)(base + _mi_os_secure_guard_page_size());
mpage->memid = memid;
mi_bbitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */);
const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bbitmap_size(MI_META_BLOCKS_PER_PAGE, NULL);
const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE);
const size_t guard_blocks = _mi_divide_up(_mi_os_secure_guard_page_size(), MI_META_BLOCK_SIZE);
mi_assert_internal(info_blocks + 2*guard_blocks < MI_META_BLOCKS_PER_PAGE);
mi_bbitmap_unsafe_setN(&mpage->blocks_free, info_blocks + guard_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks - 2*guard_blocks);
// push atomically in front of the meta page list
// (note: there is no ABA issue since we never free meta-pages)
mi_meta_page_t* old = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages);
do {
mi_atomic_store_ptr_release(mi_meta_page_t, &mpage->next, old);
} while(!mi_atomic_cas_ptr_weak_acq_rel(mi_meta_page_t,&mi_meta_pages,&old,mpage));
return mpage;
}
// allocate meta-data
mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid )
{
mi_assert_internal(pmemid != NULL);
size = _mi_align_up(size,MI_META_BLOCK_SIZE);
if (size == 0 || size > MI_META_MAX_SIZE) return NULL;
const size_t block_count = _mi_divide_up(size,MI_META_BLOCK_SIZE);
mi_assert_internal(block_count > 0 && block_count < MI_BCHUNK_BITS);
mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages);
mi_meta_page_t* mpage = mpage0;
while (mpage != NULL) {
size_t block_idx;
if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) {
// found and claimed `block_count` blocks
*pmemid = _mi_memid_create_meta(mpage, block_idx, block_count);
return mi_meta_block_start(mpage,block_idx);
}
else {
mpage = mi_meta_page_next(mpage);
}
}
// failed to find space in existing pages
if (mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages) != mpage0) {
// the page list was updated by another thread in the meantime, retry
return _mi_meta_zalloc(size,pmemid);
}
// otherwise, allocate a fresh metapage and try once more
mpage = mi_meta_page_zalloc();
if (mpage != NULL) {
size_t block_idx;
if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) {
// found and claimed `block_count` blocks
*pmemid = _mi_memid_create_meta(mpage, block_idx, block_count);
return mi_meta_block_start(mpage,block_idx);
}
}
// if all this failed, allocate from the OS
return _mi_os_alloc(size, pmemid);
}
// free meta-data
mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) {
if (p==NULL) return;
if (memid.memkind == MI_MEM_META) {
mi_assert_internal(_mi_divide_up(size, MI_META_BLOCK_SIZE) == memid.mem.meta.block_count);
const size_t block_count = memid.mem.meta.block_count;
const size_t block_idx = memid.mem.meta.block_index;
mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page;
mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage);
mi_assert_internal(block_idx + block_count < MI_META_BLOCKS_PER_PAGE);
mi_assert_internal(mi_bbitmap_is_clearN(&mpage->blocks_free, block_idx, block_count));
// we zero on free (and on the initial page allocation) so we don't need a "dirty" map
_mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE);
mi_bbitmap_setN(&mpage->blocks_free, block_idx, block_count);
}
else {
_mi_arenas_free(p,size,memid);
}
}
// used for debug output
bool _mi_meta_is_meta_page(void* p)
{
mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t, &mi_meta_pages);
mi_meta_page_t* mpage = mpage0;
while (mpage != NULL) {
if ((void*)mpage == p) return true;
mpage = mi_meta_page_next(mpage);
}
return false;
}
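Internally the meta allocator is used roughly as follows; a hedged sketch based only on the `_mi_meta_zalloc`/`_mi_meta_free` signatures above and the `memid` field of `mi_tld_s` from the types header (internal headers assumed, not a verbatim excerpt):

// hedged sketch: allocate and free a thread-local-data block via the meta allocator
static mi_tld_t* example_tld_alloc(void) {
  mi_memid_t memid;
  mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid);  // returns zero'd memory
  if (tld == NULL) return NULL;
  tld->memid = memid;   // remember the provenance so it can be freed correctly later
  return tld;
}

static void example_tld_free(mi_tld_t* tld) {
  if (tld != NULL) { _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); }
}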

(two file diffs suppressed because they are too large)
@@ -1,110 +1,317 @@
/* ----------------------------------------------------------------------------
-Copyright (c) 2019-2023 Microsoft Research, Daan Leijen
+Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/

/* ----------------------------------------------------------------------------
-Concurrent bitmap that can set/reset sequences of bits atomically,
-represented as an array of fields where each field is a machine word (`size_t`)
-There are two api's; the standard one cannot have sequences that cross
-between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
-(this is used in region allocation)
-The `_across` postfixed functions do allow sequences that can cross over
-between the fields. (This is used in arena allocation)
+Concurrent bitmap that can set/reset sequences of bits atomically
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITMAP_H
#define MI_BITMAP_H

-/* -----------------------------------------------------------
-  Bitmap definition
------------------------------------------------------------ */
-
-#define MI_BITMAP_FIELD_BITS   (8*MI_SIZE_SIZE)
-#define MI_BITMAP_FIELD_FULL   (~((size_t)0))   // all bits set
-
-// An atomic bitmap of `size_t` fields
-typedef _Atomic(size_t)  mi_bitmap_field_t;
-typedef mi_bitmap_field_t*  mi_bitmap_t;
-
-// A bitmap index is the index of the bit in a bitmap.
-typedef size_t mi_bitmap_index_t;
-
-// Create a bit index.
-static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) {
-  mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS);
-  return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
-}
-static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
-  mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
-  return mi_bitmap_index_create_ex(idx,bitidx);
-}
-
-// Get the field index from a bit index.
-static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
-  return (bitmap_idx / MI_BITMAP_FIELD_BITS);
-}
-
-// Get the bit index in a bitmap field
-static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
-  return (bitmap_idx % MI_BITMAP_FIELD_BITS);
-}
-
-// Get the full bit index
-static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
-  return bitmap_idx;
-}
-
-/* -----------------------------------------------------------
-  Claim a bit sequence atomically
------------------------------------------------------------ */
-
-// Try to atomically claim a sequence of `count` bits in a single
-// field at `idx` in `bitmap`. Returns `true` on success.
-bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+/* --------------------------------------------------------------------------------
+  Atomic bitmaps with release/acquire guarantees:
+
+  `mi_bfield_t`: is a single machine word that can efficiently be bit counted (usually `size_t`)
+      each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB).
+      We need 16K bits to represent a 1GiB arena.
+
+  `mi_bchunk_t`: a chunk of bfield's of a total of MI_BCHUNK_BITS (= 512 on 64-bit, 256 on 32-bit)
+      allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number
+      of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB).
+      These chunks are cache-aligned and we can use AVX2/AVX512/NEON/SVE/SVE2/etc. instructions
+      to scan for bits (perhaps) more efficiently.
+      We allocate byte-sized ranges aligned to bytes in the bfield, and bfield-sized
+      ranges aligned to a bfield.
+
+      Searching linearly through the chunks would be too slow (16K bits per GiB).
+      Instead we add a "chunkmap" to do a two-level search (more or less a btree of depth 2).
+
+  `mi_bchunkmap_t` (== `mi_bchunk_t`): for each chunk we track if it has (potentially) any bit set.
+      The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set.
+      This is used to avoid scanning every chunk (and thus strictly an optimization).
+      It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out
+      to have no bits set. It is also allowed to briefly have a clear bit even if the
+      chunk has bits set -- as long as we guarantee that the bit will be set later on;
+      (this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk).
+
+      However, when we clear a bit in a chunk, and the chunk is indeed all clear, we
+      cannot safely clear the bit corresponding to the chunk in the chunkmap since it
+      may race with another thread setting a bit in the same chunk. Therefore, when
+      clearing, we first test if a chunk is clear, then clear the chunkmap bit, and
+      then test again to catch any set bits that we may have missed.
+
+      Since the chunkmap may thus be briefly out-of-sync, this means that we may sometimes
+      not find a free page even though it's there (but we accept this as we avoid taking
+      full locks). (Another way to do this is to use an epoch but we like to avoid that complexity
+      for now.)
+
+  `mi_bitmap_t`: a bitmap with N chunks. A bitmap has a chunkmap of MI_BCHUNK_BITS (512)
+      and thus has at most 512 chunks (=2^18 bits x 64 KiB slices = 16 GiB max arena size).
+      The minimum is 1 chunk which is a 32 MiB arena.
+
+      For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count
+      and pop-count (but we think it can be adapted to work reasonably well on older hardware too).
+--------------------------------------------------------------------------------------------- */
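The clear-side protocol described in the comment above can be summarized with a small, self-contained model (a hedged sketch with illustrative `toy_*` names and a single word per chunk; the real code operates on `mi_bchunk_t` with multiple bfields):

// hedged model of chunkmap maintenance when clearing a bit (C11 atomics)
#include <stdatomic.h>
#include <stddef.h>

typedef struct { _Atomic(size_t) bits; } toy_chunk_t;

typedef struct {
  _Atomic(size_t) chunkmap;      // bit i set => chunk i may contain set bits
  toy_chunk_t     chunks[64];
} toy_bitmap_t;

static void toy_clear(toy_bitmap_t* bm, size_t chunk_idx, size_t bit_idx) {
  atomic_fetch_and(&bm->chunks[chunk_idx].bits, ~((size_t)1 << bit_idx));  // 1. clear the bit in the chunk
  if (atomic_load(&bm->chunks[chunk_idx].bits) == 0) {                     // 2. chunk appears all clear
    atomic_fetch_and(&bm->chunkmap, ~((size_t)1 << chunk_idx));            // 3. clear its chunkmap bit
    if (atomic_load(&bm->chunks[chunk_idx].bits) != 0) {                   // 4. re-check: a concurrent set may
      atomic_fetch_or(&bm->chunkmap, ((size_t)1 << chunk_idx));            //    have raced us; restore the bit
    }
  }
}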
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
-bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
-
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-
-// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
-// Returns `true` if successful when all previous `count` bits were 0.
-bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
-
-bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-
-//--------------------------------------------------------------------------
-// the `_across` functions work on bitmaps where sequences can cross over
-// between the fields. This is used in arena allocation
-//--------------------------------------------------------------------------
-
-// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
-
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
-
-bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+// A word-size bit field.
+typedef size_t mi_bfield_t;
+
+#define MI_BFIELD_BITS_SHIFT   (MI_SIZE_SHIFT+3)
+#define MI_BFIELD_BITS         (1 << MI_BFIELD_BITS_SHIFT)
+#define MI_BFIELD_SIZE         (MI_BFIELD_BITS/8)
+#define MI_BFIELD_LO_BIT8      (((~(mi_bfield_t)0))/0xFF)   // 0x01010101 ..
+#define MI_BFIELD_HI_BIT8      (MI_BFIELD_LO_BIT8 << 7)     // 0x80808080 ..
+
+#define MI_BCHUNK_SIZE         (MI_BCHUNK_BITS / 8)
+#define MI_BCHUNK_FIELDS       (MI_BCHUNK_BITS / MI_BFIELD_BITS)   // 8 on both 64- and 32-bit
+
+// A bitmap chunk contains 512 bits on 64-bit (256 on 32-bit)
+typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bchunk_s {
+  _Atomic(mi_bfield_t) bfields[MI_BCHUNK_FIELDS];
+} mi_bchunk_t;
+
+// The chunkmap has one bit per corresponding chunk that is set if the chunk potentially has bits set.
+// The chunkmap is itself a chunk.
+typedef mi_bchunk_t mi_bchunkmap_t;
+#define MI_BCHUNKMAP_BITS      MI_BCHUNK_BITS
+
+#define MI_BITMAP_MAX_CHUNK_COUNT      (MI_BCHUNKMAP_BITS)
+#define MI_BITMAP_MIN_CHUNK_COUNT      (1)
+#if MI_SIZE_BITS > 32
+#define MI_BITMAP_DEFAULT_CHUNK_COUNT  (64)   // 2 GiB on 64-bit -- this is for the page map
+#else
+#define MI_BITMAP_DEFAULT_CHUNK_COUNT  (1)
+#endif
#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena
#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena
#define MI_BITMAP_DEFAULT_BIT_COUNT (MI_BITMAP_DEFAULT_CHUNK_COUNT * MI_BCHUNK_BITS) // 2 GiB arena
// An atomic bitmap
typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s {
_Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc
mi_bchunkmap_t chunkmap;
mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
} mi_bitmap_t;
static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) {
return mi_atomic_load_relaxed(&((mi_bitmap_t*)bitmap)->chunk_count);
}
static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) {
return (mi_bitmap_chunk_count(bitmap) * MI_BCHUNK_BITS);
}
/* --------------------------------------------------------------------------------
Atomic bitmap operations
-------------------------------------------------------------------------------- */
// Many operations are generic over setting or clearing the bit sequence: we use `mi_xset_t` for this (true if setting, false if clearing)
typedef bool mi_xset_t;
#define MI_BIT_SET (true)
#define MI_BIT_CLEAR (false)
// Required size of a bitmap to represent `bit_count` bits.
size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count);
// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true
// returns the size of the bitmap.
size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero);
// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks).
// Not atomic so only use if still local to a thread.
void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n);
// Set a bit in the bitmap; returns `true` if it atomically transitioned from 0 to 1
bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx);
// Clear a bit in the bitmap; returns `true` if it atomically transitioned from 1 to 0
bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx);
// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
// If `already_set` is not NULL, it is set to the count of bits that were already set.
// (this is used for correct statistics when committing over a partially committed area)
bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set);
// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n);
// Is a sequence of n bits already all set/cleared?
bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
// Is a sequence of n bits already set?
// (Used to check if a memory range is already committed)
static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n);
}
// Is a sequence of n bits already clear?
static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n);
}
static inline bool mi_bitmap_is_set(mi_bitmap_t* bitmap, size_t idx) {
return mi_bitmap_is_setN(bitmap, idx, 1);
}
static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) {
return mi_bitmap_is_clearN(bitmap, idx, 1);
}
// Called once a bit is cleared to see if the memory slice can be claimed.
typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_heaptag_t heap_tag, bool* keep_set);
// Find a set bit in the bitmap, atomically clear it, and check if `claim` returns true.
// If not claimed, continue on (potentially setting the bit again depending on `keep_set`).
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx,
mi_claim_fun_t* claim, mi_arena_t* arena, mi_heaptag_t heap_tag );
// Atomically clear a bit but only if it is set. Will block otherwise until the bit is set.
// This is used to delay freeing a page that is at the same time being considered for allocation
// from `mi_arena_try_abandoned` (in the `claim` function of `mi_bitmap_try_find_and_claim`).
void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx);
// If a bit is set in the bitmap, return `true` and set `idx` to the index of the highest bit.
// Otherwise return `false` (and `*idx` is undefined).
// Used for unloading arenas
bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx);
typedef bool (mi_forall_set_fun_t)(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg2);
// Visit all set bits in a bitmap (`slice_count == 1`)
bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg);
// Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`)
bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg);
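For illustration, a hedged sketch of a visitor that could be passed to the `forall` functions above; the counting logic and the commented usage line are assumptions for this example, not part of the diff:

static bool example_count_set_slices(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg2) {
  MI_UNUSED(slice_index); MI_UNUSED(arena);
  *(size_t*)arg2 += slice_count;   // accumulate the total number of set slices
  return true;                     // keep visiting
}
// usage (sketch):
//   size_t total = 0;
//   _mi_bitmap_forall_setc_ranges(bitmap, &example_count_set_slices, arena, &total);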
/* ----------------------------------------------------------------------------
Binned concurrent bitmap
Assigns a size class to each chunk such that small blocks don't cause too
much fragmentation since we keep chunks for larger blocks separate.
---------------------------------------------------------------------------- */
// Size bins; larger bins are allowed to go into smaller bins.
// SMALL slices can only go into the small bin (or NONE), so they cannot fragment the larger bins.
typedef enum mi_bbin_e {
MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free)
MI_BBIN_SMALL, // slice_count == 1
MI_BBIN_OTHER, // slice_count: any size not covered by the other bins, with 1 <= slice_count <= MI_BCHUNK_BITS
MI_BBIN_MEDIUM, // slice_count == 8
MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS -- only used if MI_ENABLE_LARGE_PAGES is 1
MI_BBIN_COUNT
} mi_bbin_t;
static inline mi_bbin_t mi_bbin_inc(mi_bbin_t bbin) {
return (mi_bbin_t)((int)bbin + 1);
}
static inline mi_bbin_t mi_bbin_of(size_t slice_count) {
if (slice_count==1) return MI_BBIN_SMALL;
if (slice_count==8) return MI_BBIN_MEDIUM;
#if MI_ENABLE_LARGE_PAGES
if (slice_count==MI_BFIELD_BITS) return MI_BBIN_LARGE;
#endif
return MI_BBIN_OTHER;
}
// An atomic "binned" bitmap for the free slices where we keep chunks reserved for particalar size classes
typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s {
_Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
_Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set
size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc
mi_bchunkmap_t chunkmap;
_Atomic(uint8_t) chunk_bins[MI_BITMAP_MAX_CHUNK_COUNT]; // 512b
mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
} mi_bbitmap_t;
static inline size_t mi_bbitmap_chunk_count(const mi_bbitmap_t* bbitmap) {
return mi_atomic_load_relaxed(&((mi_bbitmap_t*)bbitmap)->chunk_count);
}
static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) {
return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS);
}
size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count);
// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true
// returns the size of the bitmap.
size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero);
// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks).
// Not atomic so only use if still local to a thread.
void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Set a sequence of `n` bits in the bbitmap; returns `true` if atomically transitioned from all 0's to 1's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bbitmap_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Is a sequence of n bits already all set/cleared?
bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Is a sequence of n bits already set?
// (Used to check if a memory range is already committed)
static inline bool mi_bbitmap_is_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
return mi_bbitmap_is_xsetN(MI_BIT_SET, bbitmap, idx, n);
}
// Is a sequence of n bits already clear?
static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
return mi_bbitmap_is_xsetN(MI_BIT_CLEAR, bbitmap, idx, n);
}
// Try to atomically transition `n` bits from all set to all clear. Returns `true` on success.
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Specialized versions for common bit sequence sizes
bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 1-bit
bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits
// bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS
bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS
bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // MI_BFIELD_BITS < n <= MI_BCHUNK_BITS
// Find a sequence of `n` bits in the bbitmap with all bits set, and try to atomically clear all.
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
mi_decl_nodiscard static inline bool mi_bbitmap_try_find_and_clearN(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx) {
if (n==1) return mi_bbitmap_try_find_and_clear(bbitmap, tseq, pidx); // small pages
if (n==8) return mi_bbitmap_try_find_and_clear8(bbitmap, tseq, pidx); // medium pages
// if (n==MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearX(bbitmap, tseq, pidx); // large pages
if (n==0 || n>MI_BCHUNK_BITS) return false; // cannot be more than a chunk
if (n<=MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearNX(bbitmap, tseq, n, pidx);
return mi_bbitmap_try_find_and_clearN_(bbitmap, tseq, n, pidx);
}
#endif // MI_BITMAP_H
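To make the binned-bitmap API above concrete, here is a small, hedged usage sketch (illustrative only, not part of the diff): it sizes the bitmap with `mi_bbitmap_size`, marks all slices as free, then claims and releases an 8-slice run. In real use the backing memory must be zeroed and `MI_BCHUNK_SIZE`-aligned (it normally comes from an arena); plain `calloc` is used here only for brevity.

#include <stdlib.h>
#include "bitmap.h"   // the internal header shown above

static void example_bbitmap_use(size_t slice_count) {
  size_t chunk_count;
  const size_t bytes = mi_bbitmap_size(slice_count, &chunk_count);   // required size (assumed in bytes, as for `mi_bitmap_size`)
  mi_bbitmap_t* bbitmap = (mi_bbitmap_t*)calloc(1, bytes);           // note: real callers need MI_BCHUNK_SIZE alignment
  if (bbitmap == NULL) return;
  mi_bbitmap_init(bbitmap, slice_count, true /* already_zero */);
  mi_bbitmap_unsafe_setN(bbitmap, 0, slice_count);                   // all slices start out free (still thread-local, so non-atomic is fine)
  size_t idx;
  if (mi_bbitmap_try_find_and_clearN(bbitmap, 8 /* n */, 0 /* tseq */, &idx)) {
    // claimed 8 contiguous free slices starting at `idx` (the MI_BBIN_MEDIUM size class)
    // ... use the slices ...
    mi_bbitmap_setN(bbitmap, idx, 8);                                // release them again
  }
  free(bbitmap);
}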

View file

@ -23,9 +23,6 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block);
// Free // Free
// ------------------------------------------------------ // ------------------------------------------------------
// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block);
// regular free of a (thread local) block pointer // regular free of a (thread local) block pointer
// fast path written carefully to prevent spilling on the stack // fast path written carefully to prevent spilling on the stack
static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full) static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full)
@ -50,6 +47,40 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
} }
} }
// Forward declaration for multi-threaded collect
static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept;
// Free a block multi-threaded
static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_noexcept
{
// adjust stats (after padding check and potentially recursive `mi_free` above)
mi_stat_free(page, block); // stat_free may access the padding
mi_track_free_size(block, mi_page_usable_size_of(page, block));
// _mi_padding_shrink(page, block, sizeof(mi_block_t));
#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading
size_t dbgsize = mi_usable_size(block);
if (dbgsize > MI_MiB) { dbgsize = MI_MiB; }
_mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize);
#endif
// push atomically on the page thread free list
mi_thread_free_t tf_new;
mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free);
do {
mi_block_set_next(page, block, mi_tf_block(tf_old));
tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */);
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); // todo: release is enough?
// and atomically try to collect the page if it was abandoned
const bool is_owned_now = !mi_tf_is_owned(tf_old);
if (is_owned_now) {
mi_assert_internal(mi_page_is_abandoned(page));
mi_free_try_collect_mt(page);
}
}
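The push onto `xthread_free` above is the usual compare-and-swap list push. A hedged, self-contained C11 sketch of the same pattern with plain atomics (names are illustrative, not mimalloc's; mimalloc itself uses an acquire-release CAS at this point):

#include <stdatomic.h>
#include <stddef.h>

typedef struct node_s { struct node_s* next; } node_t;

// push `n` onto the lock-free singly-linked list headed by `*head`
static void lockfree_push(_Atomic(node_t*)* head, node_t* n) {
  node_t* old = atomic_load_explicit(head, memory_order_relaxed);
  do {
    n->next = old;                      // link to the current head
  } while (!atomic_compare_exchange_weak_explicit(
              head, &old, n,
              memory_order_release,     // publish the link on success
              memory_order_relaxed));   // on failure, retry with the refreshed head
}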
// Adjust a block that was allocated aligned, to the actual start of the block in the page. // Adjust a block that was allocated aligned, to the actual start of the block in the page.
// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the // note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the
// `page_start` and `block_size` fields; however these are constant and the page won't be // `page_start` and `block_size` fields; however these are constant and the page won't be
@ -57,7 +88,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) {
mi_assert_internal(page!=NULL && p!=NULL); mi_assert_internal(page!=NULL && p!=NULL);
size_t diff = (uint8_t*)p - page->page_start; size_t diff = (uint8_t*)p - mi_page_start(page);
size_t adjust; size_t adjust;
if mi_likely(page->block_size_shift != 0) { if mi_likely(page->block_size_shift != 0) {
adjust = diff & (((size_t)1 << page->block_size_shift) - 1); adjust = diff & (((size_t)1 << page->block_size_shift) - 1);
@ -81,218 +112,153 @@ static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, vo
} }
#endif #endif
// free a local pointer (page parameter comes first for better codegen) // free a local pointer (page parameter comes first for better codegen)
static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept {
MI_UNUSED(segment);
mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p); mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p);
mi_block_check_unguard(page, block, p); mi_block_check_unguard(page, block, p);
mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */); mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */);
} }
// free a pointer owned by another thread (page parameter comes first for better codegen) // free a pointer owned by another thread (page parameter comes first for better codegen)
static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept {
if (p==NULL) return; // a NULL pointer is seen as abandoned (tid==0) with a full flag set
mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865)
mi_block_check_unguard(page, block, p); mi_block_check_unguard(page, block, p);
mi_free_block_mt(page, segment, block); mi_free_block_mt(page, block);
} }
// generic free (for runtime integration) // generic free (for runtime integration)
void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
if (is_local) mi_free_generic_local(page,segment,p); if (is_local) mi_free_generic_local(page,p);
else mi_free_generic_mt(page,segment,p); else mi_free_generic_mt(page,p);
} }
// Get the segment data belonging to a pointer // Get the segment data belonging to a pointer
// This is just a single `and` in release mode but does further checks in debug mode // This is just a single `and` in release mode but does further checks in debug mode
// (and secure mode) to see if this was a valid pointer. // (and secure mode) to see if this was a valid pointer.
static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg)
{ {
MI_UNUSED(msg); MI_UNUSED_RELEASE(msg);
#if MI_DEBUG
#if (MI_DEBUG>0)
if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) { if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) {
_mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
return NULL; return NULL;
} }
#endif mi_page_t* const page = _mi_safe_ptr_page(p);
if (page == NULL && p != NULL) {
mi_segment_t* const segment = _mi_ptr_segment(p); _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p);
if mi_unlikely(segment==NULL) return segment;
#if (MI_DEBUG>0)
if mi_unlikely(!mi_is_in_heap_region(p)) {
_mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
"(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) {
_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
}
} }
return page;
#else
return _mi_ptr_page(p);
#endif #endif
#if (MI_DEBUG>0 || MI_SECURE>=4)
if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) {
_mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
return NULL;
}
#endif
return segment;
} }
// Free a block // Free a block
// Fast path written carefully to prevent register spilling on the stack // Fast path written carefully to prevent register spilling on the stack
void mi_free(void* p) mi_attr_noexcept void mi_free(void* p) mi_attr_noexcept
{ {
mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); mi_page_t* const page = mi_checked_ptr_page(p,"mi_free");
if mi_unlikely(segment==NULL) return;
const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); #if MI_PAGE_MAP_FLAT // if not flat, NULL will point to `_mi_page_empty` and get to `mi_free_generic_mt`
mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_unlikely(page==NULL) return;
#endif
if mi_likely(is_local) { // thread-local free? const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page));
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) if mi_likely(xtid == 0) { // thread-local free? `tid==mi_page_thread_id(page) && mi_page_flags(page)==0`
// thread-local, aligned, and not a full page // thread-local, aligned, and not a full page
mi_block_t* const block = (mi_block_t*)p; mi_block_t* const block = (mi_block_t*)p;
mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
} }
else { else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid == mi_page_thread_id(page) && mi_page_flags(page)!=0`
// page is full or contains (inner) aligned blocks; use generic path // page is local, but is full or contains (inner) aligned blocks; use generic path
mi_free_generic_local(page, segment, p); mi_free_generic_local(page, p);
} }
// free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap)
else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0`
// blocks are aligned (and not a full page)
mi_block_t* const block = (mi_block_t*)p;
mi_free_block_mt(page,block);
} }
else { else {
// not thread-local; use generic path // page is full or contains (inner) aligned blocks; use generic multi-thread path
mi_free_generic_mt(page, segment, p); mi_free_generic_mt(page, p);
} }
} }
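The `xtid` dispatch in the new `mi_free` folds the owning thread id and the page flag bits into a single XOR and compare. A hedged, self-contained sketch of the same trick with plain integers follows; all names here are illustrative stand-ins (the real code uses `_mi_prim_thread_id`, `mi_page_xthread_id`, and `MI_PAGE_FLAG_MASK`), and it assumes thread ids have the low flag bits clear, as mimalloc arranges by alignment.

#include <stdint.h>
#include <stdio.h>

#define FLAG_MASK 0x3u  // low bits of the combined word hold the page flags (assumed width)

// combined word: thread id with the low bits reserved for flags
static uint64_t make_xthread_id(uint64_t tid, uint64_t flags) {
  return (tid & ~(uint64_t)FLAG_MASK) | flags;
}

static const char* classify(uint64_t my_tid, uint64_t xthread_id) {
  const uint64_t xtid = my_tid ^ xthread_id;     // a single xor distinguishes all four cases
  if (xtid == 0)               return "local fast free";            // same thread, no flags
  if (xtid <= FLAG_MASK)       return "local generic free";         // same thread, full/aligned flag set
  if ((xtid & FLAG_MASK) == 0) return "multi-threaded free";        // other thread, no flags
  return "multi-threaded generic free";                             // other thread, flags set
}

int main(void) {
  const uint64_t me = 0x1000, other = 0x2000;
  printf("%s\n", classify(me, make_xthread_id(me, 0)));
  printf("%s\n", classify(me, make_xthread_id(me, 1)));
  printf("%s\n", classify(me, make_xthread_id(other, 0)));
  printf("%s\n", classify(me, make_xthread_id(other, 2)));
  return 0;
}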
// return true if successful
bool _mi_free_delayed_block(mi_block_t* block) {
// get segment and page
mi_assert_internal(block!=NULL);
const mi_segment_t* const segment = _mi_ptr_segment(block);
mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(_mi_thread_id() == segment->thread_id);
mi_page_t* const page = _mi_segment_page_of(segment, block);
// Clear the no-delayed flag so delayed freeing is used again for this page.
// This must be done before collecting the free lists on this page -- otherwise
// some blocks may end up in the page `thread_free` list with no blocks in the
// heap `thread_delayed_free` list which may cause the page to be never freed!
// (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) {
return false;
}
// collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count
_mi_page_free_collect(page, false);
// and free the block (possibly freeing the page as well since `used` is updated)
mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */);
return true;
}
// ------------------------------------------------------ // ------------------------------------------------------
// Multi-threaded Free (`_mt`) // Multi-threaded Free (`_mt`)
// ------------------------------------------------------ // ------------------------------------------------------
// Push a block that is owned by another thread on its page-local thread free
// list or it's heap delayed free list. Such blocks are later collected by
// the owning thread in `_mi_free_delayed_block`.
static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block )
{
// Try to put the block on either the page-local thread free list,
// or the heap delayed free list (if this is the first non-local free in that page)
mi_thread_free_t tfreex;
bool use_delayed;
mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE);
if mi_unlikely(use_delayed) {
// unlikely: this only happens on the first concurrent free in a page that is in the full list
tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING);
}
else {
// usual: directly add to page thread_free list
mi_block_set_next(page, block, mi_tf_block(tfree));
tfreex = mi_tf_set_block(tfree,block);
}
} while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
// If this was the first non-local free, we need to push it on the heap delayed free list instead static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept {
if mi_unlikely(use_delayed) { mi_assert_internal(mi_page_is_owned(page));
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) mi_assert_internal(mi_page_is_abandoned(page));
mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page);
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
do {
mi_block_set_nextx(heap,block,dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
}
// and reset the MI_DELAYED_FREEING flag // we own the page now..
tfree = mi_atomic_load_relaxed(&page->xthread_free); // safe to collect the thread atomic free list
do { _mi_page_free_collect(page, false); // update `used` count
tfreex = tfree; #if MI_DEBUG > 1
mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); }
tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); #endif
} while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
}
}
// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) // 1. free if the page is free now
static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) if (mi_page_all_free(page))
{
// first see if the segment was abandoned and if we can reclaim it into our thread
if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 &&
#if MI_HUGE_PAGE_ABANDON
segment->page_kind != MI_PAGE_HUGE &&
#endif
mi_atomic_load_relaxed(&segment->thread_id) == 0 && // segment is abandoned?
mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initialized (issue #944))
{ {
// the segment is abandoned, try to reclaim it into our heap // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish)
if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { _mi_arenas_page_unabandon(page);
mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); // we can free the page directly
mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc); _mi_arenas_page_free(page);
mi_free(block); // recursively free as now it will be a local free in our heap return;
return; }
// 2. if the page is not too full, we can try to reclaim it for ourselves
// note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit.
if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 &&
!mi_page_is_used_at_frac(page,8)
// && !mi_page_is_abandoned_mapped(page)
)
{
// the page has still some blocks in use (but not too many)
// reclaim in our heap if compatible, or otherwise abandon again
// todo: optimize this check further?
// note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should
// not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944)
mi_heap_t* const heap = mi_prim_get_default_heap();
if (heap != (mi_heap_t*)&_mi_heap_empty) // we did not already terminate our thread (can this happen?)
{
mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag);
if ((tagheap != NULL) && // don't reclaim across heap object types
(tagheap->allow_page_reclaim) && // we are allowed to reclaim abandoned pages
// (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? )
(_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arenas; todo: inline arena_is_suitable (?)
)
{
if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for a block_size we don't use
// first remove it from the abandoned pages in the arena -- this waits for any readers to finish
_mi_arenas_page_unabandon(page);
_mi_heap_page_reclaim(tagheap, page);
mi_heap_stat_counter_increase(tagheap, pages_reclaim_on_free, 1);
return;
}
}
} }
} }
// The padding check may access the non-thread-owned page for the key values. // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations
// that is safe as these are constant and the page won't be freed (as the block is not freed yet). if (!mi_page_is_used_at_frac(page,8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page
mi_check_padding(page, block); !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA &&
_mi_arenas_page_try_reabandon_to_mapped(page))
// adjust stats (after padding check and potentially recursive `mi_free` above) {
mi_stat_free(page, block); // stat_free may access the padding
mi_track_free_size(block, mi_page_usable_size_of(page,block));
// for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
_mi_padding_shrink(page, block, sizeof(mi_block_t));
if (segment->page_kind == MI_PAGE_HUGE) {
#if MI_HUGE_PAGE_ABANDON
// huge page segments are always abandoned and can be freed immediately
_mi_segment_huge_page_free(segment, page, block);
return; return;
#else
// huge pages are special as they occupy the entire segment
// as these are large we reset the memory occupied by the page so it is available to other threads
// (as the owning thread needs to actually free the memory later).
_mi_segment_huge_page_reset(segment, page, block);
#endif
}
else {
#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading
memset(block, MI_DEBUG_FREED, mi_usable_size(block));
#endif
} }
// and finally free the actual block by pushing it on the owning heap
// thread_delayed free list (or heap delayed free list) // not reclaimed or free'd, unown again
mi_free_block_delayed_mt(page,block); _mi_page_unown(page);
} }
@ -316,9 +282,8 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p
} }
static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); const mi_page_t* const page = mi_checked_ptr_page(p,msg);
if mi_unlikely(segment==NULL) return 0; if mi_unlikely(page==NULL) return 0;
const mi_page_t* const page = _mi_segment_page_of(segment, p);
if mi_likely(!mi_page_has_aligned(page)) { if mi_likely(!mi_page_has_aligned(page)) {
const mi_block_t* block = (const mi_block_t*)p; const mi_block_t* block = (const mi_block_t*)p;
return mi_page_usable_size_of(page, block); return mi_page_usable_size_of(page, block);
@ -513,21 +478,21 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
// only maintain stats for smaller objects if requested // only maintain stats for smaller objects if requested
#if (MI_STAT>0) #if (MI_STAT>0)
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
#if (MI_STAT < 2) #if (MI_STAT < 2)
MI_UNUSED(block); MI_UNUSED(block);
#endif #endif
mi_heap_t* const heap = mi_heap_get_default(); mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_usable_block_size(page); const size_t bsize = mi_page_usable_block_size(page);
#if (MI_STAT>1) #if (MI_STAT>1)
const size_t usize = mi_page_usable_size_of(page, block); const size_t usize = mi_page_usable_size_of(page, block);
mi_heap_stat_decrease(heap, malloc, usize); mi_heap_stat_decrease(heap, malloc, usize);
#endif #endif
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, normal, bsize); mi_heap_stat_decrease(heap, normal, bsize);
#if (MI_STAT > 1) #if (MI_STAT > 1)
mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1);
#endif #endif
} }
else { else {
const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc
@ -535,7 +500,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
} }
} }
#else #else
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
MI_UNUSED(page); MI_UNUSED(block); MI_UNUSED(page); MI_UNUSED(block);
} }
#endif #endif
@ -553,7 +518,7 @@ static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) {
const size_t bsize = mi_page_block_size(page); const size_t bsize = mi_page_block_size(page);
const size_t psize = _mi_os_page_size(); const size_t psize = _mi_os_page_size();
mi_assert_internal(bsize > psize); mi_assert_internal(bsize > psize);
mi_assert_internal(_mi_page_segment(page)->allow_decommit); mi_assert_internal(!page->memid.is_pinned);
void* gpage = (uint8_t*)block + bsize - psize; void* gpage = (uint8_t*)block + bsize - psize;
mi_assert_internal(_mi_is_aligned(gpage, psize)); mi_assert_internal(_mi_is_aligned(gpage, psize));
_mi_os_unprotect(gpage, psize); _mi_os_unprotect(gpage, psize);

View file

@ -7,11 +7,8 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc.h" #include "mimalloc.h"
#include "mimalloc/internal.h" #include "mimalloc/internal.h"
#include "mimalloc/atomic.h"
#include "mimalloc/prim.h" // mi_prim_get_default_heap #include "mimalloc/prim.h" // mi_prim_get_default_heap
#include <string.h> // memset, memcpy
#if defined(_MSC_VER) && (_MSC_VER < 1920) #if defined(_MSC_VER) && (_MSC_VER < 1920)
#pragma warning(disable:4204) // non-constant aggregate initializer #pragma warning(disable:4204) // non-constant aggregate initializer
#endif #endif
@ -58,8 +55,6 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
MI_UNUSED(arg2); MI_UNUSED(arg2);
MI_UNUSED(pq); MI_UNUSED(pq);
mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_heap(page) == heap);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == heap->thread_id);
mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_expensive(_mi_page_is_valid(page));
return true; return true;
} }
@ -98,7 +93,7 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t
if (mi_page_all_free(page)) { if (mi_page_all_free(page)) {
// no more used blocks, free the page. // no more used blocks, free the page.
// note: this will free retired pages as well. // note: this will free retired pages as well.
_mi_page_free(page, pq, collect >= MI_FORCE); _mi_page_free(page, pq);
} }
else if (collect == MI_ABANDON) { else if (collect == MI_ABANDON) {
// still used blocks but the thread is done; abandon the page // still used blocks but the thread is done; abandon the page
@ -107,14 +102,6 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t
return true; // don't break return true; // don't break
} }
static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
MI_UNUSED(arg1);
MI_UNUSED(arg2);
MI_UNUSED(heap);
MI_UNUSED(pq);
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
return true; // don't break
}
static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
{ {
@ -124,49 +111,19 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
_mi_deferred_free(heap, force); _mi_deferred_free(heap, force);
// python/cpython#112532: we may be called from a thread that is not the owner of the heap // python/cpython#112532: we may be called from a thread that is not the owner of the heap
const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); // const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id());
// note: never reclaim on collect but leave it to threads that need storage to reclaim
if (
#ifdef NDEBUG
collect == MI_FORCE
#else
collect >= MI_FORCE
#endif
&& is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim)
{
// the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
// if all memory is freed by now, all segments should be freed.
// note: this only collects in the current subprocess
_mi_abandoned_reclaim_all(heap, &heap->tld->segments);
}
// if abandoning, mark all pages to no longer add to delayed_free
if (collect == MI_ABANDON) {
mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
}
// free all current thread delayed blocks.
// (if abandoning, after this there are no more thread-delayed references into the pages.)
_mi_heap_delayed_free_all(heap);
// collect retired pages // collect retired pages
_mi_heap_collect_retired(heap, force); _mi_heap_collect_retired(heap, force);
// if (_mi_is_main_thread()) { mi_debug_show_arenas(true, false, false); }
// collect all pages owned by this thread // collect all pages owned by this thread
mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
// collect segments (purge pages, this can be expensive so don't force on abandonment) // collect arenas (this is program wide so don't force purges on abandonment of threads)
_mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); //mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1);
_mi_arenas_collect(collect == MI_FORCE /* force purge? */, true /* visit all? */, heap->tld);
// if forced, collect thread data cache on program-exit (or shared library unload)
if (force && is_main_thread && mi_heap_is_backing(heap)) {
_mi_thread_data_collect(); // collect thread data cache
}
// collect arenas (this is program wide so don't force purges on abandonment of threads)
_mi_arenas_collect(collect == MI_FORCE /* force purge? */);
} }
void _mi_heap_collect_abandon(mi_heap_t* heap) { void _mi_heap_collect_abandon(mi_heap_t* heap) {
@ -187,8 +144,12 @@ void mi_collect(bool force) mi_attr_noexcept {
----------------------------------------------------------- */ ----------------------------------------------------------- */
mi_heap_t* mi_heap_get_default(void) { mi_heap_t* mi_heap_get_default(void) {
mi_thread_init(); mi_heap_t* heap = mi_prim_get_default_heap();
return mi_prim_get_default_heap(); if mi_unlikely(!mi_heap_is_initialized(heap)) {
mi_thread_init();
heap = mi_prim_get_default_heap();
}
return heap;
} }
static bool mi_heap_is_default(const mi_heap_t* heap) { static bool mi_heap_is_default(const mi_heap_t* heap) {
@ -201,39 +162,77 @@ mi_heap_t* mi_heap_get_backing(void) {
mi_assert_internal(heap!=NULL); mi_assert_internal(heap!=NULL);
mi_heap_t* bheap = heap->tld->heap_backing; mi_heap_t* bheap = heap->tld->heap_backing;
mi_assert_internal(bheap!=NULL); mi_assert_internal(bheap!=NULL);
mi_assert_internal(bheap->thread_id == _mi_thread_id()); mi_assert_internal(bheap->tld->thread_id == _mi_thread_id());
return bheap; return bheap;
} }
void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { // todo: make order of parameters consistent (but would that break compat with CPython?)
void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld)
{
mi_assert_internal(heap!=NULL);
mi_memid_t memid = heap->memid;
_mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
heap->tld = tld; heap->memid = memid;
heap->thread_id = _mi_thread_id(); heap->tld = tld; // avoid reading the thread-local tld during initialization
heap->arena_id = arena_id; heap->exclusive_arena = _mi_arena_from_id(arena_id);
heap->no_reclaim = noreclaim; heap->allow_page_reclaim = !noreclaim;
heap->tag = tag; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_page_full_retain) >= 0);
if (heap == tld->heap_backing) { heap->full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32);
heap->tag = heap_tag;
if (heap->tld->is_in_threadpool) {
// if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap.
// (but abandoning is good in this case)
heap->allow_page_reclaim = false;
// and reduce the full page retain (possibly to 0)
if (heap->full_page_retain >= 0) {
heap->full_page_retain = heap->full_page_retain / 4;
}
}
if (heap->tld->heap_backing == NULL) {
heap->tld->heap_backing = heap; // first heap becomes the backing heap
_mi_random_init(&heap->random); _mi_random_init(&heap->random);
} }
else { else {
_mi_random_split(&tld->heap_backing->random, &heap->random); _mi_random_split(&heap->tld->heap_backing->random, &heap->random);
} }
heap->cookie = _mi_heap_random_next(heap) | 1; heap->cookie = _mi_heap_random_next(heap) | 1;
heap->keys[0] = _mi_heap_random_next(heap); //heap->keys[0] = _mi_heap_random_next(heap);
heap->keys[1] = _mi_heap_random_next(heap); //heap->keys[1] = _mi_heap_random_next(heap);*/
_mi_heap_guarded_init(heap); _mi_heap_guarded_init(heap);
// push on the thread local heaps list // push on the thread local heaps list
heap->next = heap->tld->heaps; heap->next = heap->tld->heaps;
heap->tld->heaps = heap; heap->tld->heaps = heap;
} }
mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld) {
mi_assert_internal(tld!=NULL);
mi_assert(heap_tag >= 0 && heap_tag < 256);
// allocate and initialize a heap
mi_memid_t memid;
mi_heap_t* heap;
if (arena_id == _mi_arena_id_none()) {
heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid);
}
else {
// heaps associated with a specific arena are allocated in that arena
// note: takes up at least one slice which is quite wasteful...
heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid);
}
if (heap==NULL) {
_mi_error_message(ENOMEM, "unable to allocate heap meta-data\n");
return NULL;
}
heap->memid = memid;
_mi_heap_init(heap, arena_id, allow_destroy, (uint8_t)heap_tag, tld);
return heap;
}
mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) { mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) {
mi_heap_t* bheap = mi_heap_get_backing(); mi_heap_t* bheap = mi_heap_get_backing();
mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? mi_assert_internal(bheap != NULL);
if (heap == NULL) return NULL; return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld);
mi_assert(heap_tag >= 0 && heap_tag < 256);
_mi_heap_init(heap, bheap->tld, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */);
return heap;
} }
mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
@ -246,7 +245,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new(void) {
} }
bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) {
return _mi_arena_memid_is_suitable(memid, heap->arena_id); return _mi_arena_memid_is_suitable(memid, heap->exclusive_arena);
} }
uintptr_t _mi_heap_random_next(mi_heap_t* heap) { uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
@ -258,14 +257,14 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
mi_assert_internal(heap != NULL); mi_assert_internal(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap)); mi_assert_internal(mi_heap_is_initialized(heap));
// TODO: copy full empty heap instead? // TODO: copy full empty heap instead?
memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct)); _mi_memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
_mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages)); _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
heap->thread_delayed_free = NULL; // heap->thread_delayed_free = NULL;
heap->page_count = 0; heap->page_count = 0;
} }
// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources. // called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
static void mi_heap_free(mi_heap_t* heap) { static void mi_heap_free(mi_heap_t* heap, bool do_free_mem) {
mi_assert(heap != NULL); mi_assert(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap)); mi_assert_internal(mi_heap_is_initialized(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return; if (heap==NULL || !mi_heap_is_initialized(heap)) return;
@ -292,7 +291,9 @@ static void mi_heap_free(mi_heap_t* heap) {
mi_assert_internal(heap->tld->heaps != NULL); mi_assert_internal(heap->tld->heaps != NULL);
// and free the used memory // and free the used memory
mi_free(heap); if (do_free_mem) {
_mi_meta_free(heap, sizeof(*heap), heap->memid);
}
} }
// return a heap on the same thread as `heap` specialized for the specified tag (if it exists) // return a heap on the same thread as `heap` specialized for the specified tag (if it exists)
@ -319,24 +320,24 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
MI_UNUSED(pq); MI_UNUSED(pq);
// ensure no more thread_delayed_free will be added // ensure no more thread_delayed_free will be added
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); //_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
// stats // stats
const size_t bsize = mi_page_block_size(page); const size_t bsize = mi_page_block_size(page);
if (bsize > MI_LARGE_OBJ_SIZE_MAX) { if (bsize > MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, huge, bsize); mi_heap_stat_decrease(heap, huge, bsize);
} }
#if (MI_STAT) #if (MI_STAT)
_mi_page_free_collect(page, false); // update used count _mi_page_free_collect(page, false); // update used count
const size_t inuse = page->used; const size_t inuse = page->used;
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, normal, bsize * inuse); mi_heap_stat_decrease(heap, normal, bsize * inuse);
#if (MI_STAT>1) #if (MI_STAT>1)
mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse); mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse);
#endif #endif
} }
mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks... mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks...
#endif #endif
/// pretend it is all free now /// pretend it is all free now
mi_assert_internal(mi_page_thread_free(page) == NULL); mi_assert_internal(mi_page_thread_free(page) == NULL);
@ -346,7 +347,8 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
// mi_page_free(page,false); // mi_page_free(page,false);
page->next = NULL; page->next = NULL;
page->prev = NULL; page->prev = NULL;
_mi_segment_page_free(page,false /* no force? */, &heap->tld->segments); mi_page_set_heap(page, NULL);
_mi_arenas_page_free(page);
return true; // keep going return true; // keep going
} }
@ -367,7 +369,8 @@ static bool mi_cdecl mi_heap_track_block_free(const mi_heap_t* heap, const mi_he
void mi_heap_destroy(mi_heap_t* heap) { void mi_heap_destroy(mi_heap_t* heap) {
mi_assert(heap != NULL); mi_assert(heap != NULL);
mi_assert(mi_heap_is_initialized(heap)); mi_assert(mi_heap_is_initialized(heap));
mi_assert(heap->no_reclaim); mi_assert(!heap->allow_page_reclaim);
mi_assert(!heap->allow_page_abandon);
mi_assert_expensive(mi_heap_is_valid(heap)); mi_assert_expensive(mi_heap_is_valid(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return; if (heap==NULL || !mi_heap_is_initialized(heap)) return;
#if MI_GUARDED #if MI_GUARDED
@ -375,9 +378,9 @@ void mi_heap_destroy(mi_heap_t* heap) {
mi_heap_delete(heap); mi_heap_delete(heap);
return; return;
#else #else
if (!heap->no_reclaim) { if (heap->allow_page_reclaim) {
_mi_warning_message("'mi_heap_destroy' called but ignored as the heap was not created with 'allow_destroy' (heap at %p)\n", heap); _mi_warning_message("'mi_heap_destroy' called but ignored as the heap was not created with 'allow_destroy' (heap at %p)\n", heap);
// don't free in case it may contain reclaimed pages // don't free in case it may contain reclaimed pages,
mi_heap_delete(heap); mi_heap_delete(heap);
} }
else { else {
@ -387,7 +390,7 @@ void mi_heap_destroy(mi_heap_t* heap) {
#endif #endif
// free all pages // free all pages
_mi_heap_destroy_pages(heap); _mi_heap_destroy_pages(heap);
mi_heap_free(heap); mi_heap_free(heap,true);
} }
#endif #endif
} }
@ -399,7 +402,7 @@ void _mi_heap_unsafe_destroy_all(mi_heap_t* heap) {
mi_heap_t* curr = heap->tld->heaps; mi_heap_t* curr = heap->tld->heaps;
while (curr != NULL) { while (curr != NULL) {
mi_heap_t* next = curr->next; mi_heap_t* next = curr->next;
if (curr->no_reclaim) { if (!curr->allow_page_reclaim) {
mi_heap_destroy(curr); mi_heap_destroy(curr);
} }
else { else {
@ -414,44 +417,30 @@ void _mi_heap_unsafe_destroy_all(mi_heap_t* heap) {
----------------------------------------------------------- */ ----------------------------------------------------------- */
// Transfer the pages from one heap to the other // Transfer the pages from one heap to the other
static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { //static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
mi_assert_internal(heap!=NULL); // mi_assert_internal(heap!=NULL);
if (from==NULL || from->page_count == 0) return; // if (from==NULL || from->page_count == 0) return;
//
// // transfer all pages by appending the queues; this will set a new heap field
// for (size_t i = 0; i <= MI_BIN_FULL; i++) {
// mi_page_queue_t* pq = &heap->pages[i];
// mi_page_queue_t* append = &from->pages[i];
// size_t pcount = _mi_page_queue_append(heap, pq, append);
// heap->page_count += pcount;
// from->page_count -= pcount;
// }
// mi_assert_internal(from->page_count == 0);
//
// // and reset the `from` heap
// mi_heap_reset_pages(from);
//}
// reduce the size of the delayed frees //// are two heaps compatible with respect to heap-tag, exclusive arena etc.
_mi_heap_delayed_free_partial(from); //static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) {
// return (heap1->tag == heap2->tag && // store same kind of objects
// transfer all pages by appending the queues; this will set a new heap field // heap1->tld->subproc == heap2->tld->subproc && // same sub-process
// so threads may do delayed frees in either heap for a while. // heap1->arena_id == heap2->arena_id); // same arena preference
// note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state //}
// so after this only the new heap will get delayed frees
for (size_t i = 0; i <= MI_BIN_FULL; i++) {
mi_page_queue_t* pq = &heap->pages[i];
mi_page_queue_t* append = &from->pages[i];
size_t pcount = _mi_page_queue_append(heap, pq, append);
heap->page_count += pcount;
from->page_count -= pcount;
}
mi_assert_internal(from->page_count == 0);
// and do outstanding delayed frees in the `from` heap
// note: be careful here as the `heap` field in all those pages no longer point to `from`,
// turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a
// the regular `_mi_free_delayed_block` which is safe.
_mi_heap_delayed_free_all(from);
#if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353
mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL);
#endif
// and reset the `from` heap
mi_heap_reset_pages(from);
}
// are two heaps compatible with respect to heap-tag, exclusive arena etc.
static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) {
return (heap1->tag == heap2->tag && // store same kind of objects
heap1->arena_id == heap2->arena_id); // same arena preference
}
// Safe delete a heap without freeing any still allocated blocks in that heap. // Safe delete a heap without freeing any still allocated blocks in that heap.
void mi_heap_delete(mi_heap_t* heap) void mi_heap_delete(mi_heap_t* heap)
@ -461,17 +450,11 @@ void mi_heap_delete(mi_heap_t* heap)
mi_assert_expensive(mi_heap_is_valid(heap)); mi_assert_expensive(mi_heap_is_valid(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return; if (heap==NULL || !mi_heap_is_initialized(heap)) return;
mi_heap_t* bheap = heap->tld->heap_backing; // abandon all pages
if (bheap != heap && mi_heaps_are_compatible(bheap,heap)) { _mi_heap_collect_abandon(heap);
// transfer still used pages to the backing heap
mi_heap_absorb(bheap, heap);
}
else {
// the backing heap abandons its pages
_mi_heap_collect_abandon(heap);
}
mi_assert_internal(heap->page_count==0); mi_assert_internal(heap->page_count==0);
mi_heap_free(heap); mi_heap_free(heap,true);
} }
mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
@ -485,7 +468,63 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
} }
/* -----------------------------------------------------------
Load/unload heaps
----------------------------------------------------------- */
void mi_heap_unload(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
mi_assert_expensive(mi_heap_is_valid(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
if (heap->exclusive_arena == NULL) {
_mi_warning_message("cannot unload heaps that are not associated with an exclusive arena\n");
return;
}
// abandon all pages so the thread id's in the pages are cleared
_mi_heap_collect_abandon(heap);
mi_assert_internal(heap->page_count==0);
// remove from heap list
mi_heap_free(heap, false /* but don't actually free the memory */);
// disassociate from the current thread-local and static state
heap->tld = NULL;
return;
}
bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) {
mi_assert(mi_heap_is_initialized(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
if (heap->exclusive_arena == NULL) {
_mi_warning_message("cannot reload heaps that were not associated with an exclusive arena\n");
return false;
}
if (heap->tld != NULL) {
_mi_warning_message("cannot reload heaps that were not unloaded first\n");
return false;
}
mi_arena_t* arena = _mi_arena_from_id(arena_id);
if (heap->exclusive_arena != arena) {
_mi_warning_message("trying to reload a heap at a different arena address: %p vs %p\n", heap->exclusive_arena, arena);
return false;
}
mi_assert_internal(heap->page_count==0);
// re-associate with the current thread-local and static state
heap->tld = mi_heap_get_default()->tld;
// reinit direct pages (as we may be in a different process)
mi_assert_internal(heap->page_count == 0);
for (size_t i = 0; i < MI_PAGES_DIRECT; i++) {
heap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty;
}
// push on the thread local heaps list
heap->next = heap->tld->heaps;
heap->tld->heaps = heap;
return true;
}
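A hedged usage sketch of the new unload/reload pair (illustrative only): the arena setup uses the public `mi_reserve_os_memory_ex` and `mi_heap_new_in_arena` calls, and the hand-off between threads or processes, including re-mapping the arena memory, is assumed to happen elsewhere.

#include <mimalloc.h>

// Park a heap that lives entirely inside an exclusive arena.
static mi_heap_t* example_park_heap(mi_arena_id_t* arena_id) {
  // reserve 64 MiB as a committed, exclusive arena (no large OS pages)
  if (mi_reserve_os_memory_ex(64 * 1024 * 1024, true /*commit*/, false /*allow_large*/,
                              true /*exclusive*/, arena_id) != 0) return NULL;
  mi_heap_t* heap = mi_heap_new_in_arena(*arena_id);
  if (heap == NULL) return NULL;
  void* p = mi_heap_malloc(heap, 128);   // allocations stay inside the exclusive arena
  (void)p;
  mi_heap_unload(heap);                  // abandon all pages and detach from this thread
  return heap;                           // the heap (and its arena) can now be handed off
}

// Later, possibly on another thread: re-attach and continue allocating.
static void example_resume_heap(mi_heap_t* heap, mi_arena_id_t arena_id) {
  if (mi_heap_reload(heap, arena_id)) {  // re-associate with the current thread's state
    void* q = mi_heap_malloc(heap, 256);
    mi_free(q);
  }
}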
/* ----------------------------------------------------------- /* -----------------------------------------------------------
Analysis Analysis
@ -494,11 +533,8 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
// static since it is not thread safe to access heaps from other threads. // static since it is not thread safe to access heaps from other threads.
static mi_heap_t* mi_heap_of_block(const void* p) { static mi_heap_t* mi_heap_of_block(const void* p) {
if (p == NULL) return NULL; if (p == NULL) return NULL;
mi_segment_t* segment = _mi_ptr_segment(p); mi_page_t* page = _mi_ptr_page(p); // TODO: check pointer validity?
bool valid = (_mi_ptr_cookie(segment) == segment->cookie); return mi_page_heap(page);
mi_assert_internal(valid);
if mi_unlikely(!valid) return NULL;
return mi_page_heap(_mi_segment_page_of(segment,p));
} }
bool mi_heap_contains_block(mi_heap_t* heap, const void* p) { bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
@ -573,7 +609,7 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_
if (page->used == 0) return true; if (page->used == 0) return true;
size_t psize; size_t psize;
uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); uint8_t* const pstart = mi_page_area(page, &psize);
mi_heap_t* const heap = mi_page_heap(page); mi_heap_t* const heap = mi_page_heap(page);
const size_t bsize = mi_page_block_size(page); const size_t bsize = mi_page_block_size(page);
const size_t ubsize = mi_page_usable_block_size(page); // without padding const size_t ubsize = mi_page_usable_block_size(page); // without padding

View file

@ -11,32 +11,31 @@ terms of the MIT license. A copy of the license can be found in the file
#include <string.h> // memcpy, memset #include <string.h> // memcpy, memset
#include <stdlib.h> // atexit #include <stdlib.h> // atexit
#define MI_MEMID_INIT(kind) {{{NULL,0}}, kind, true /* pinned */, true /* committed */, false /* zero */ }
#define MI_MEMID_STATIC MI_MEMID_INIT(MI_MEM_STATIC)
// Empty page used to initialize the small free pages array // Empty page used to initialize the small free pages array
const mi_page_t _mi_page_empty = { const mi_page_t _mi_page_empty = {
0, MI_ATOMIC_VAR_INIT(MI_PAGE_IN_FULL_QUEUE), // xthread_id (must set flag to catch NULL on a free)
false, false, false, false, NULL, // free
0, // capacity 0, // used
0, // reserved capacity 0, // capacity
{ 0 }, // flags 0, // reserved capacity
false, // is_zero 0, // block size shift
0, // retire_expire 0, // retire_expire
NULL, // free NULL, // local_free
NULL, // local_free MI_ATOMIC_VAR_INIT(0), // xthread_free
0, // used 0, // block_size
0, // block size shift NULL, // page_start
0, // heap tag 0, // heap tag
0, // block_size false, // is_zero
NULL, // page_start
#if (MI_PADDING || MI_ENCODE_FREELIST) #if (MI_PADDING || MI_ENCODE_FREELIST)
{ 0, 0 }, { 0, 0 }, // keys
#endif
MI_ATOMIC_VAR_INIT(0), // xthread_free
MI_ATOMIC_VAR_INIT(0), // xheap
NULL, NULL
#if MI_INTPTR_SIZE==4
, { NULL }
#endif #endif
NULL, // xheap
NULL, NULL, // next, prev
MI_ARENA_SLICE_SIZE, // page_committed
MI_MEMID_STATIC // memid
}; };
#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
@ -63,8 +62,8 @@ const mi_page_t _mi_page_empty = {
QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \ QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \
QNULL(MI_LARGE_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \ QNULL(MI_LARGE_MAX_OBJ_WSIZE + 1 /* 655360, Huge queue */), \
QNULL(MI_LARGE_OBJ_WSIZE_MAX + 2) /* Full queue */ } QNULL(MI_LARGE_MAX_OBJ_WSIZE + 2) /* Full queue */ }
#define MI_STAT_COUNT_NULL() {0,0,0,0} #define MI_STAT_COUNT_NULL() {0,0,0,0}
@ -82,12 +81,10 @@ const mi_page_t _mi_page_empty = {
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 } \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
MI_STAT_COUNT_END_NULL() MI_STAT_COUNT_END_NULL()
// -------------------------------------------------------- // --------------------------------------------------------
@ -99,24 +96,83 @@ const mi_page_t _mi_page_empty = {
// may lead to allocation itself on some platforms) // may lead to allocation itself on some platforms)
// -------------------------------------------------------- // --------------------------------------------------------
static mi_decl_cache_align mi_subproc_t subproc_main
#if __cplusplus
= { }; // empty initializer to prevent running the constructor (with msvc)
#else
= { 0 }; // C zero initialize
#endif
static mi_decl_cache_align mi_tld_t tld_empty = {
0, // thread_id
0, // thread_seq
&subproc_main, // subproc
NULL, // heap_backing
NULL, // heaps list
0, // heartbeat
false, // recurse
false, // is_in_threadpool
{ MI_STATS_NULL }, // stats
MI_MEMID_STATIC // memid
};
mi_decl_cache_align const mi_heap_t _mi_heap_empty = { mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
NULL, &tld_empty, // tld
MI_ATOMIC_VAR_INIT(NULL), NULL, // exclusive_arena
0, // tid 0, // cookie
0, // cookie //{ 0, 0 }, // keys
0, // arena id { {0}, {0}, 0, true }, // random
{ 0, 0 }, // keys 0, // page count
{ {0}, {0}, 0, true }, // random MI_BIN_FULL, 0, // page retired min/max
0, // page count 0, // generic count
MI_BIN_FULL, 0, // page retired min/max NULL, // next
NULL, // next 0, // full page retain
false, // can reclaim false, // can reclaim
0, // tag true, // can eager abandon
0, // tag
#if MI_GUARDED #if MI_GUARDED
0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
#endif #endif
MI_SMALL_PAGES_EMPTY, MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY MI_PAGE_QUEUES_EMPTY,
MI_MEMID_STATIC
};
extern mi_heap_t heap_main;
static mi_decl_cache_align mi_tld_t tld_main = {
0, // thread_id
0, // thread_seq
&subproc_main, // subproc
&heap_main, // heap_backing
&heap_main, // heaps list
0, // heartbeat
false, // recurse
false, // is_in_threadpool
{ MI_STATS_NULL }, // stats
MI_MEMID_STATIC // memid
};
mi_decl_cache_align mi_heap_t heap_main = {
&tld_main, // thread local data
NULL, // exclusive arena
0, // initial cookie
//{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0x846ca68b}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
0, // generic count
NULL, // next heap
2, // full page retain
true, // allow page reclaim
true, // allow page abandon
0, // tag
#if MI_GUARDED
0, 0, 0, 0, 0,
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY,
MI_MEMID_STATIC
}; };
@ -127,39 +183,6 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
// the thread-local default heap for allocation // the thread-local default heap for allocation
mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
extern mi_heap_t _mi_heap_main;
static mi_decl_cache_align mi_subproc_t mi_subproc_default;
static mi_decl_cache_align mi_tld_t tld_main = {
0, false,
&_mi_heap_main, &_mi_heap_main,
{ { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0},
0, 0, 0, 0, 0, &mi_subproc_default,
&tld_main.stats
}, // segments
{ MI_STATS_NULL } // stats
};
mi_decl_cache_align mi_heap_t _mi_heap_main = {
&tld_main,
MI_ATOMIC_VAR_INIT(NULL),
0, // thread id
0, // initial cookie
0, // arena id
{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0x846ca68b}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
NULL, // next heap
false, // can reclaim
0, // tag
#if MI_GUARDED
0, 0, 0, 0, 0,
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY
};
bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`.
@ -175,7 +198,7 @@ mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t samp
if (heap->guarded_sample_rate >= 1) { if (heap->guarded_sample_rate >= 1) {
heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate; heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate;
} }
heap->guarded_sample_count = heap->guarded_sample_seed; // count down samples heap->guarded_sample_count = 1 + heap->guarded_sample_seed; // count down samples
} }
mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) { mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) {
@ -204,28 +227,132 @@ void _mi_heap_guarded_init(mi_heap_t* heap) {
} }
#endif #endif
// Initialize main subproc
static void mi_heap_main_init(void) { static void mi_subproc_main_init(void) {
if (_mi_heap_main.cookie == 0) { if (subproc_main.memid.memkind != MI_MEM_STATIC) {
_mi_heap_main.thread_id = _mi_thread_id(); subproc_main.memid = _mi_memid_create(MI_MEM_STATIC);
_mi_heap_main.cookie = 1; mi_lock_init(&subproc_main.os_abandoned_pages_lock);
#if defined(_WIN32) && !defined(MI_SHARED_LIB) mi_lock_init(&subproc_main.arena_reserve_lock);
_mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking
#else
_mi_random_init(&_mi_heap_main.random);
#endif
_mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
mi_lock_init(&mi_subproc_default.abandoned_os_lock);
mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock);
_mi_heap_guarded_init(&_mi_heap_main);
} }
} }
mi_heap_t* _mi_heap_main_get(void) { // Initialize main tld
static void mi_tld_main_init(void) {
if (tld_main.thread_id == 0) {
tld_main.thread_id = _mi_prim_thread_id();
}
}
// Initialization of the (statically allocated) main heap, and the main tld and subproc.
static void mi_heap_main_init(void) {
if (heap_main.cookie == 0) {
mi_subproc_main_init();
mi_tld_main_init();
// heap
heap_main.cookie = 1;
#if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB)
_mi_random_init_weak(&heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking
#else
_mi_random_init(&heap_main.random);
#endif
heap_main.cookie = _mi_heap_random_next(&heap_main);
//heap_main.keys[0] = _mi_heap_random_next(&heap_main);
//heap_main.keys[1] = _mi_heap_random_next(&heap_main);
_mi_heap_guarded_init(&heap_main);
heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0);
heap_main.full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32);
}
}
mi_heap_t* heap_main_get(void) {
mi_heap_main_init(); mi_heap_main_init();
return &_mi_heap_main; return &heap_main;
}
/* -----------------------------------------------------------
Thread local data
----------------------------------------------------------- */
// Count current and total created threads
static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
static _Atomic(size_t) thread_total_count;
size_t _mi_current_thread_count(void) {
return mi_atomic_load_relaxed(&thread_count);
}
// The mimalloc thread local data
mi_decl_thread mi_tld_t* thread_tld = &tld_empty;
// Allocate fresh tld
static mi_tld_t* mi_tld_alloc(void) {
mi_atomic_increment_relaxed(&thread_count);
if (_mi_is_main_thread()) {
return &tld_main;
}
else {
// allocate tld meta-data
// note: we need to be careful to not access the tld from `_mi_meta_zalloc`
// (and in turn from `_mi_arena_alloc_aligned` and `_mi_os_alloc_aligned`).
mi_memid_t memid;
mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid);
if (tld==NULL) {
_mi_error_message(ENOMEM, "unable to allocate memory for thread local data\n");
return NULL;
}
tld->memid = memid;
tld->heap_backing = NULL;
tld->heaps = NULL;
tld->subproc = &subproc_main;
tld->thread_id = _mi_prim_thread_id();
tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1);
tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool();
return tld;
}
}
#define MI_TLD_INVALID ((mi_tld_t*)1)
mi_decl_noinline static void mi_tld_free(mi_tld_t* tld) {
if (tld != NULL && tld != MI_TLD_INVALID) {
_mi_stats_done(&tld->stats);
_mi_meta_free(tld, sizeof(mi_tld_t), tld->memid);
}
#if 0
// do not read/write to `thread_tld` on older macOS <= 14 as that will re-initialize the thread local storage
// (since we are calling this during pthread shutdown)
// (and this could happen on other systems as well, so let's never do it)
thread_tld = MI_TLD_INVALID;
#endif
mi_atomic_decrement_relaxed(&thread_count);
}
static mi_tld_t* mi_tld(void) {
mi_tld_t* tld = thread_tld;
if (tld == MI_TLD_INVALID) {
_mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n");
thread_tld = &tld_empty;
}
if (tld==&tld_empty) {
thread_tld = tld = mi_tld_alloc();
}
return tld;
}
mi_subproc_t* _mi_subproc(void) {
// should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()`)
// todo: this will still fail on systems where the first access to a thread-local causes allocation.
// on such systems we can check for this with the _mi_prim_get_default_heap as those are protected (by being
// stored in a TLS slot for example)
mi_heap_t* heap = mi_prim_get_default_heap();
if (heap == NULL) {
return _mi_subproc_main();
}
else {
return heap->tld->subproc; // avoid using thread local storage (`thread_tld`)
}
} }
@ -233,179 +360,99 @@ mi_heap_t* _mi_heap_main_get(void) {
Sub process Sub process
----------------------------------------------------------- */ ----------------------------------------------------------- */
mi_subproc_t* _mi_subproc_main(void) {
return &subproc_main;
}
mi_subproc_id_t mi_subproc_main(void) { mi_subproc_id_t mi_subproc_main(void) {
return NULL; return NULL;
} }
mi_subproc_id_t mi_subproc_new(void) { mi_subproc_id_t mi_subproc_new(void) {
mi_memid_t memid = _mi_memid_none(); mi_memid_t memid;
mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid);
if (subproc == NULL) return NULL; if (subproc == NULL) return NULL;
subproc->memid = memid; subproc->memid = memid;
subproc->abandoned_os_list = NULL; mi_lock_init(&subproc->os_abandoned_pages_lock);
mi_lock_init(&subproc->abandoned_os_lock); mi_lock_init(&subproc->arena_reserve_lock);
mi_lock_init(&subproc->abandoned_os_visit_lock);
return subproc; return subproc;
} }
mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) {
return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id); return (subproc_id == NULL ? &subproc_main : (mi_subproc_t*)subproc_id);
} }
void mi_subproc_delete(mi_subproc_id_t subproc_id) { void mi_subproc_delete(mi_subproc_id_t subproc_id) {
if (subproc_id == NULL) return; if (subproc_id == NULL) return;
mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id);
// check if there are no abandoned segments still.. // check if there are os pages still..
bool safe_to_delete = false; bool safe_to_delete = false;
mi_lock(&subproc->abandoned_os_lock) { mi_lock(&subproc->os_abandoned_pages_lock) {
if (subproc->abandoned_os_list == NULL) { if (subproc->os_abandoned_pages == NULL) {
safe_to_delete = true; safe_to_delete = true;
} }
} }
if (!safe_to_delete) return; if (!safe_to_delete) return;
// merge stats back into the main subproc?
_mi_stats_merge_from(&_mi_subproc_main()->stats, &subproc->stats);
// safe to release // safe to release
// todo: should we refcount subprocesses? // todo: should we refcount subprocesses?
mi_lock_done(&subproc->abandoned_os_lock); mi_lock_done(&subproc->os_abandoned_pages_lock);
mi_lock_done(&subproc->abandoned_os_visit_lock); mi_lock_done(&subproc->arena_reserve_lock);
_mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid);
} }
void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) {
mi_heap_t* heap = mi_heap_get_default(); mi_tld_t* tld = mi_tld();
if (heap == NULL) return; if (tld == NULL) return;
mi_assert(heap->tld->segments.subproc == &mi_subproc_default); mi_assert(tld->subproc == &subproc_main);
if (heap->tld->segments.subproc != &mi_subproc_default) return; if (tld->subproc != &subproc_main) return;
heap->tld->segments.subproc = _mi_subproc_from_id(subproc_id); tld->subproc = _mi_subproc_from_id(subproc_id);
} }
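The sub-process functions above back the public API; a small hedged sketch of how a group of threads can be isolated in its own sub-process so that their abandoned pages do not mix with unrelated threads:

#include <mimalloc.h>

// sketch: each worker joins the sub-process before it allocates anything
static void worker(mi_subproc_id_t subproc) {
  mi_subproc_add_current_thread(subproc);  // tld must still belong to the main sub-process here
  void* p = mi_malloc(128);                // now accounted to `subproc`
  mi_free(p);
}

static void run_isolated(void) {
  mi_subproc_id_t subproc = mi_subproc_new();
  // ... spawn threads that call worker(subproc) and join them ...
  mi_subproc_delete(subproc);              // only releases once no abandoned OS pages remain
}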
/* ----------------------------------------------------------- /* -----------------------------------------------------------
Initialization and freeing of the thread local heaps Allocate heap data
----------------------------------------------------------- */ ----------------------------------------------------------- */
// note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size).
typedef struct mi_thread_data_s {
mi_heap_t heap; // must come first due to cast in `_mi_heap_done`
mi_tld_t tld;
mi_memid_t memid; // must come last due to zero'ing
} mi_thread_data_t;
// Thread meta-data is allocated directly from the OS. For
// some programs that do not use thread pools and allocate and
// destroy many OS threads, this may cause too much overhead
// per thread so we maintain a small cache of recently freed metadata.
#define TD_CACHE_SIZE (32)
static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
static mi_thread_data_t* mi_thread_data_zalloc(void) {
// try to find thread metadata in the cache
bool is_zero = false;
mi_thread_data_t* td = NULL;
for (int i = 0; i < TD_CACHE_SIZE; i++) {
td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
if (td != NULL) {
// found cached allocation, try use it
td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
if (td != NULL) {
break;
}
}
}
// if that fails, allocate as meta data
if (td == NULL) {
mi_memid_t memid;
td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
if (td == NULL) {
// if this fails, try once more. (issue #257)
td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
if (td == NULL) {
// really out of memory
_mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
}
}
if (td != NULL) {
td->memid = memid;
is_zero = memid.initially_zero;
}
}
if (td != NULL && !is_zero) {
_mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid));
}
return td;
}
static void mi_thread_data_free( mi_thread_data_t* tdfree ) {
// try to add the thread metadata to the cache
for (int i = 0; i < TD_CACHE_SIZE; i++) {
mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
if (td == NULL) {
mi_thread_data_t* expected = NULL;
if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) {
return;
}
}
}
// if that fails, just free it directly
_mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid);
}
void _mi_thread_data_collect(void) {
// free all thread metadata from the cache
for (int i = 0; i < TD_CACHE_SIZE; i++) {
mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
if (td != NULL) {
td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
if (td != NULL) {
_mi_os_free(td, sizeof(mi_thread_data_t), td->memid);
}
}
}
}
// Initialize the thread local default heap, called from `mi_thread_init` // Initialize the thread local default heap, called from `mi_thread_init`
static bool _mi_thread_heap_init(void) { static bool _mi_thread_heap_init(void) {
if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true;
if (_mi_is_main_thread()) { if (_mi_is_main_thread()) {
// mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization // mi_assert_internal(heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization
// the main heap is statically allocated // the main heap is statically allocated
mi_heap_main_init(); mi_heap_main_init();
_mi_heap_set_default_direct(&_mi_heap_main); _mi_heap_set_default_direct(&heap_main);
//mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap());
} }
else { else {
// use `_mi_os_alloc` to allocate directly from the OS // allocates tld data
mi_thread_data_t* td = mi_thread_data_zalloc(); // note: we cannot access thread-locals yet as that can cause (recursive) allocation
if (td == NULL) return false; // (on macOS <= 14 for example where the loader allocates thread-local data on demand).
mi_tld_t* tld = mi_tld_alloc();
mi_tld_t* tld = &td->tld; // allocate and initialize the heap
mi_heap_t* heap = &td->heap; mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld);
_mi_tld_init(tld, heap); // must be before `_mi_heap_init`
_mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); // associate the heap with this thread
// (this is safe, on macOS for example, the heap is set in a dedicated TLS slot and thus does not cause recursive allocation)
_mi_heap_set_default_direct(heap); _mi_heap_set_default_direct(heap);
// now that the heap is set for this thread, we can set the thread-local tld.
thread_tld = tld;
} }
return false; return false;
} }
// initialize thread local data
void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) {
_mi_memzero_aligned(tld,sizeof(mi_tld_t));
tld->heap_backing = bheap;
tld->heaps = NULL;
tld->segments.subproc = &mi_subproc_default;
tld->segments.stats = &tld->stats;
}
// Free the thread local default heap (called from `mi_thread_done`) // Free the thread local default heap (called from `mi_thread_done`)
static bool _mi_thread_heap_done(mi_heap_t* heap) { static bool _mi_thread_heap_done(mi_heap_t* heap) {
if (!mi_heap_is_initialized(heap)) return true; if (!mi_heap_is_initialized(heap)) return true;
// reset default heap // reset default heap
_mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty); _mi_heap_set_default_direct(_mi_is_main_thread() ? &heap_main : (mi_heap_t*)&_mi_heap_empty);
// switch to backing heap // switch to backing heap
heap = heap->tld->heap_backing; heap = heap->tld->heap_backing;
@ -425,26 +472,22 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) {
mi_assert_internal(mi_heap_is_backing(heap)); mi_assert_internal(mi_heap_is_backing(heap));
// collect if not the main thread // collect if not the main thread
if (heap != &_mi_heap_main) { if (heap != &heap_main) {
_mi_heap_collect_abandon(heap); _mi_heap_collect_abandon(heap);
} }
// merge stats // free heap meta data
_mi_stats_done(&heap->tld->stats); _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid);
// free if not the main thread if (heap == &heap_main) {
if (heap != &_mi_heap_main) {
mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id());
mi_thread_data_free((mi_thread_data_t*)heap);
}
else {
#if 0 #if 0
// never free the main thread even in debug mode; if a dll is linked statically with mimalloc, // never free the main thread even in debug mode; if a dll is linked statically with mimalloc,
// there may still be delete/free calls after the mi_fls_done is called. Issue #207 // there may still be delete/free calls after the mi_fls_done is called. Issue #207
_mi_heap_destroy_pages(heap); _mi_heap_destroy_pages(heap);
mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); mi_assert_internal(heap->tld->heap_backing == &heap_main);
#endif #endif
} }
return false; return false;
} }
@ -458,7 +501,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) {
// 1. windows dynamic library: // 1. windows dynamic library:
// call from DllMain on DLL_THREAD_DETACH // call from DllMain on DLL_THREAD_DETACH
// 2. windows static library: // 2. windows static library:
// use `FlsAlloc` to call a destructor when the thread is done // use special linker section to call a destructor when the thread is done
// 3. unix, pthreads: // 3. unix, pthreads:
// use a pthread key to call a destructor when a pthread is done // use a pthread key to call a destructor when a pthread is done
// //
@ -472,19 +515,14 @@ static void mi_process_setup_auto_thread_done(void) {
if (tls_initialized) return; if (tls_initialized) return;
tls_initialized = true; tls_initialized = true;
_mi_prim_thread_init_auto_done(); _mi_prim_thread_init_auto_done();
_mi_heap_set_default_direct(&_mi_heap_main); _mi_heap_set_default_direct(&heap_main);
} }
bool _mi_is_main_thread(void) { bool _mi_is_main_thread(void) {
return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id()); return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id());
} }
static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
size_t _mi_current_thread_count(void) {
return mi_atomic_load_relaxed(&thread_count);
}
// This is called from the `mi_malloc_generic` // This is called from the `mi_malloc_generic`
void mi_thread_init(void) mi_attr_noexcept void mi_thread_init(void) mi_attr_noexcept
@ -497,8 +535,7 @@ void mi_thread_init(void) mi_attr_noexcept
// fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
if (_mi_thread_heap_init()) return; // returns true if already initialized if (_mi_thread_heap_init()) return; // returns true if already initialized
_mi_stat_increase(&_mi_stats_main.threads, 1); mi_subproc_stat_increase(_mi_subproc_main(), threads, 1);
mi_atomic_increment_relaxed(&thread_count);
//_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id()); //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
} }
@ -520,14 +557,18 @@ void _mi_thread_done(mi_heap_t* heap)
} }
// adjust stats // adjust stats
mi_atomic_decrement_relaxed(&thread_count); mi_subproc_stat_decrease(_mi_subproc_main(), threads, 1);
_mi_stat_decrease(&_mi_stats_main.threads, 1);
// check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
if (heap->thread_id != _mi_thread_id()) return; if (heap->tld->thread_id != _mi_prim_thread_id()) return;
// abandon the thread local heap // abandon the thread local heap
if (_mi_thread_heap_done(heap)) return; // returns true if already ran // note: we store the tld as we should avoid reading `thread_tld` at this point (to avoid reinitializing the thread local storage)
mi_tld_t* tld = heap->tld;
_mi_thread_heap_done(heap); // returns true if already ran
// free thread local data
mi_tld_free(tld);
} }
void _mi_heap_set_default_direct(mi_heap_t* heap) { void _mi_heap_set_default_direct(mi_heap_t* heap) {
@ -580,7 +621,7 @@ void _mi_process_load(void) {
} }
// reseed random // reseed random
_mi_random_reinit_if_weak(&_mi_heap_main.random); _mi_random_reinit_if_weak(&heap_main.random);
} }
#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
@ -607,7 +648,7 @@ void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once // ensure we are called once
static mi_atomic_once_t process_init; static mi_atomic_once_t process_init;
#if _MSC_VER < 1920 #if _MSC_VER < 1920
mi_heap_main_init(); // vs2017 can dynamically re-initialize _mi_heap_main mi_heap_main_init(); // vs2017 can dynamically re-initialize heap_main
#endif #endif
if (!mi_atomic_once(&process_init)) return; if (!mi_atomic_once(&process_init)) return;
_mi_process_is_initialized = true; _mi_process_is_initialized = true;
@ -615,8 +656,11 @@ void mi_process_init(void) mi_attr_noexcept {
mi_process_setup_auto_thread_done(); mi_process_setup_auto_thread_done();
mi_detect_cpu_features(); mi_detect_cpu_features();
_mi_os_init(); mi_subproc_main_init();
mi_tld_main_init();
mi_heap_main_init(); mi_heap_main_init();
_mi_os_init();
_mi_page_map_init();
#if MI_DEBUG #if MI_DEBUG
_mi_verbose_message("debug level : %d\n", MI_DEBUG); _mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif #endif
@ -627,7 +671,7 @@ void mi_process_init(void) mi_attr_noexcept {
#endif #endif
mi_thread_init(); mi_thread_init();
#if defined(_WIN32) #if defined(_WIN32) && defined(MI_WIN_USE_FLS)
// On windows, when building as a static lib the FLS cleanup happens too early for the main thread. // On windows, when building as a static lib the FLS cleanup happens too early for the main thread.
// To avoid this, set the FLS value for the main thread to NULL so the fls cleanup // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup
// will not call _mi_thread_done on the (still executing) main thread. See issue #508. // will not call _mi_thread_done on the (still executing) main thread. See issue #508.
@ -686,15 +730,14 @@ void mi_cdecl _mi_process_done(void) {
if (mi_option_is_enabled(mi_option_destroy_on_exit)) { if (mi_option_is_enabled(mi_option_destroy_on_exit)) {
mi_heap_collect(heap, true /* force */); mi_heap_collect(heap, true /* force */);
_mi_heap_unsafe_destroy_all(heap); // forcefully release all memory held by all heaps (of this thread only!) _mi_heap_unsafe_destroy_all(heap); // forcefully release all memory held by all heaps (of this thread only!)
_mi_arena_unsafe_destroy_all(); _mi_arenas_unsafe_destroy_all(heap->tld);
_mi_segment_map_unsafe_destroy();
} }
if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
mi_stats_print(NULL); mi_stats_print(NULL);
} }
_mi_allocator_done(); _mi_allocator_done();
_mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); _mi_verbose_message("process done: 0x%zx\n", tld_main.thread_id);
os_preloading = true; // don't call the C runtime anymore os_preloading = true; // don't call the C runtime anymore
} }


@ -1,5 +1,5 @@
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution. "LICENSE" at the root of this distribution.
@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file
// -------------------------------------------------------- // --------------------------------------------------------
// This module defines various std libc functions to reduce // This module defines various std libc functions to reduce
// the dependency on libc, and also prevent errors caused // the dependency on libc, and also prevent errors caused
// by some libc implementations when called before `main` // by some libc implementations when called before `main`
// executes (due to malloc redirection) // executes (due to malloc redirection)
// -------------------------------------------------------- // --------------------------------------------------------
@ -83,9 +83,9 @@ bool _mi_getenv(const char* name, char* result, size_t result_size) {
// Define our own limited `_mi_vsnprintf` and `_mi_snprintf` // Define our own limited `_mi_vsnprintf` and `_mi_snprintf`
// This is mostly to avoid calling these when libc is not yet // This is mostly to avoid calling these when libc is not yet
// initialized (and to reduce dependencies) // initialized (and to reduce dependencies)
// //
// format: d i, p x u, s // format: d i, p, x, u, s
// prec: z l ll L // type: z l ll L
// width: 10 // width: 10
// align-left: - // align-left: -
// fill: 0 // fill: 0
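Given the limited format support listed above, a short sketch of what library-internal callers of `_mi_snprintf` can rely on (no floating point; only the listed specifiers, types, width, fill, and alignment):

#include "mimalloc/internal.h"   // declares _mi_snprintf (library-internal)

static void snprintf_sketch(void) {
  char buf[128];
  _mi_snprintf(buf, sizeof(buf), "reserved %zu KiB at %p", (size_t)1024, (void*)buf);
  _mi_snprintf(buf, sizeof(buf), "flags: 0x%08x", 0x12u);   // '0' fill, width 8
  _mi_snprintf(buf, sizeof(buf), "%-10s|", "left");         // '-' aligns left in a width of 10
}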
@ -130,7 +130,7 @@ static void mi_out_alignright(char fill, char* start, size_t len, size_t extra,
} }
static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end)
{ {
if (x == 0 || base == 0 || base > 16) { if (x == 0 || base == 0 || base > 16) {
if (prefix != 0) { mi_outc(prefix, out, end); } if (prefix != 0) { mi_outc(prefix, out, end); }
@ -144,8 +144,8 @@ static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char*
mi_outc((digit <= 9 ? '0' + digit : 'A' + digit - 10),out,end); mi_outc((digit <= 9 ? '0' + digit : 'A' + digit - 10),out,end);
x = x / base; x = x / base;
} }
if (prefix != 0) { if (prefix != 0) {
mi_outc(prefix, out, end); mi_outc(prefix, out, end);
} }
size_t len = *out - start; size_t len = *out - start;
// and reverse in-place // and reverse in-place
@ -171,7 +171,18 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
char c; char c;
MI_NEXTC(); MI_NEXTC();
if (c != '%') { if (c != '%') {
if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t') { // output visible ascii or standard control only if (c == '\\') {
MI_NEXTC();
switch (c) {
case 'e': mi_outc('\x1B', &out, end); break;
case 't': mi_outc('\t', &out, end); break;
case 'n': mi_outc('\n', &out, end); break;
case 'r': mi_outc('\r', &out, end); break;
case '\\': mi_outc('\\', &out, end); break;
default: /* ignore */ break;
}
}
else if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t' || c=='\x1b') { // output visible ascii or standard control only
mi_outc(c, &out, end); mi_outc(c, &out, end);
} }
} }
@ -181,7 +192,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
size_t width = 0; size_t width = 0;
char numtype = 'd'; char numtype = 'd';
char numplus = 0; char numplus = 0;
bool alignright = true; bool alignright = true;
if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); } if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); }
if (c == '-') { alignright = false; MI_NEXTC(); } if (c == '-') { alignright = false; MI_NEXTC(); }
if (c == '0') { fill = '0'; MI_NEXTC(); } if (c == '0') { fill = '0'; MI_NEXTC(); }
@ -191,7 +202,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
width = (10 * width) + (c - '0'); MI_NEXTC(); width = (10 * width) + (c - '0'); MI_NEXTC();
} }
if (c == 0) break; // extra check due to while if (c == 0) break; // extra check due to while
} }
if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); } if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); }
else if (c == 'l') { else if (c == 'l') {
numtype = c; MI_NEXTC(); numtype = c; MI_NEXTC();
@ -199,7 +210,10 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
} }
char* start = out; char* start = out;
if (c == 's') { if (c == '%') {
mi_outc('%', &out, end);
}
else if (c == 's') {
// string // string
const char* s = va_arg(args, const char*); const char* s = va_arg(args, const char*);
mi_outs(s, &out, end); mi_outs(s, &out, end);
@ -273,3 +287,127 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) {
_mi_vsnprintf(buf, buflen, fmt, args); _mi_vsnprintf(buf, buflen, fmt, args);
va_end(args); va_end(args);
} }
// --------------------------------------------------------
// generic trailing and leading zero count, and popcount
// --------------------------------------------------------
#if !MI_HAS_FAST_BITSCAN
static size_t mi_ctz_generic32(uint32_t x) {
// de Bruijn multiplication, see <http://keithandkatie.com/keith/papers/debruijn.html>
static const uint8_t debruijn[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
if (x==0) return 32;
return debruijn[(uint32_t)((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27];
}
static size_t mi_clz_generic32(uint32_t x) {
// de Bruijn multiplication, see <http://keithandkatie.com/keith/papers/debruijn.html>
static const uint8_t debruijn[32] = {
31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
};
if (x==0) return 32;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27];
}
size_t _mi_ctz_generic(size_t x) {
if (x==0) return MI_SIZE_BITS;
#if (MI_SIZE_BITS <= 32)
return mi_ctz_generic32((uint32_t)x);
#else
const uint32_t lo = (uint32_t)x;
if (lo != 0) {
return mi_ctz_generic32(lo);
}
else {
return (32 + mi_ctz_generic32((uint32_t)(x>>32)));
}
#endif
}
size_t _mi_clz_generic(size_t x) {
if (x==0) return MI_SIZE_BITS;
#if (MI_SIZE_BITS <= 32)
return mi_clz_generic32((uint32_t)x);
#else
const uint32_t hi = (uint32_t)(x>>32);
if (hi != 0) {
return mi_clz_generic32(hi);
}
else {
return 32 + mi_clz_generic32((uint32_t)x);
}
#endif
}
#endif // bit scan
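A short worked example of the de Bruijn trick used above: `x & -x` isolates the lowest set bit, multiplying by the de Bruijn constant pushes a unique 5-bit pattern into the top bits, and those bits index the table (illustrative values only).

#include <assert.h>
#include <stdint.h>

static void ctz_debruijn_example(void) {
  uint32_t x = 20;                                            // 0b10100, ctz == 2
  uint32_t lowest = x & (uint32_t)(-(int32_t)x);              // 0b00100 == 4
  uint32_t index  = (uint32_t)(lowest * 0x077CB531U) >> 27;   // == 3
  assert(lowest == 4u && index == 3u);                        // debruijn[3] == 2 in the table above
}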
#if !MI_HAS_FAST_POPCOUNT
#if MI_SIZE_SIZE == 4
#define mi_mask_even_bits32 (0x55555555)
#define mi_mask_even_pairs32 (0x33333333)
#define mi_mask_even_nibbles32 (0x0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
static size_t mi_byte_sum32(uint32_t x) {
// perform `x * 0x01010101`: the highest byte contains the sum of all bytes.
x += (x << 8);
x += (x << 16);
return (size_t)(x >> 24);
}
static size_t mi_popcount_generic32(uint32_t x) {
// first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10
// in other words, `a - (a>>1)`; to do this in parallel, we need to mask to prevent spilling a bit pair
// into the lower bit-pair:
x = x - ((x >> 1) & mi_mask_even_bits32);
// add the 2-bit pair results
x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32);
// add the 4-bit nibble results
x = (x + (x >> 4)) & mi_mask_even_nibbles32;
// each byte now has a count of its bits, we can sum them now:
return mi_byte_sum32(x);
}
size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic32(x);
}
#else
#define mi_mask_even_bits64 (0x5555555555555555)
#define mi_mask_even_pairs64 (0x3333333333333333)
#define mi_mask_even_nibbles64 (0x0F0F0F0F0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
static size_t mi_byte_sum64(uint64_t x) {
x += (x << 8);
x += (x << 16);
x += (x << 32);
return (size_t)(x >> 56);
}
static size_t mi_popcount_generic64(uint64_t x) {
x = x - ((x >> 1) & mi_mask_even_bits64);
x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64);
x = (x + (x >> 4)) & mi_mask_even_nibbles64;
return mi_byte_sum64(x);
}
size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic64(x);
}
#endif
#endif // popcount
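A worked example of the SWAR reduction above on a single byte with four bits set: each step widens the per-group counts until every byte holds its own bit count, and the byte-sum step collects them in the top byte (illustrative values only).

#include <assert.h>
#include <stdint.h>

static void popcount_swar_example(void) {
  uint32_t x = 0xF0u;                                          // four bits set
  uint32_t a = x - ((x >> 1) & 0x55555555u);                   // 2-bit pair counts: 0xA0
  uint32_t b = (a & 0x33333333u) + ((a >> 2) & 0x33333333u);   // nibble counts:     0x40
  uint32_t c = (b + (b >> 4)) & 0x0F0F0F0Fu;                   // byte counts:       0x04
  uint32_t s = c; s += (s << 8); s += (s << 16);               // byte sum ends up in the top byte
  assert(a == 0xA0u && b == 0x40u && c == 0x04u && (s >> 24) == 4u);
}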


@ -102,6 +102,14 @@ typedef struct mi_option_desc_s {
#endif #endif
#endif #endif
#ifndef MI_DEFAULT_PAGEMAP_COMMIT
#if defined(__APPLE__) // when overloading malloc, we still get mixed pointers sometimes on macOS; this avoids a bad access
#define MI_DEFAULT_PAGEMAP_COMMIT 1
#else
#define MI_DEFAULT_PAGEMAP_COMMIT 0
#endif
#endif
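A hedged sketch of overriding the page-map commit behaviour at startup; the environment-variable spelling follows mimalloc's usual `MIMALLOC_<OPTION>` convention and is an assumption here.

#include <mimalloc.h>

int main(void) {
  // equivalent to setting MIMALLOC_PAGEMAP_COMMIT=1 in the environment
  mi_option_set(mi_option_pagemap_commit, 1);   // must run before the first allocation
  void* p = mi_malloc(32);
  mi_free(p);
  return 0;
}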
static mi_option_desc_t options[_mi_option_last] = static mi_option_desc_t options[_mi_option_last] =
{ {
@ -136,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] =
#else #else
{ 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
#endif #endif
{ 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds
{ 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes.
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas)
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose
@ -145,9 +153,8 @@ static mi_option_desc_t options[_mi_option_last] =
{ 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try.
{ 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
{ MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`)
{ 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's
{ 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) },
{ 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free
{ MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's)
{ 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries.
#if defined(MI_VISIT_ABANDONED) #if defined(MI_VISIT_ABANDONED)
@ -162,6 +169,13 @@ static mi_option_desc_t options[_mi_option_last] =
UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000)
{ 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(guarded_sample_seed)},
{ 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable.
{ 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free
{ 2, UNINIT, MI_OPTION(page_full_retain) },
{ 4, UNINIT, MI_OPTION(page_max_candidates) },
{ 0, UNINIT, MI_OPTION(max_vabits) },
{ MI_DEFAULT_PAGEMAP_COMMIT,
UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront?
{ 2, UNINIT, MI_OPTION(page_commit_on_demand) },
}; };
static void mi_option_init(mi_option_desc_t* desc); static void mi_option_init(mi_option_desc_t* desc);
@ -416,7 +430,7 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me
// Define our own limited `fprintf` that avoids memory allocation. // Define our own limited `fprintf` that avoids memory allocation.
// We do this using `_mi_vsnprintf` with a limited buffer. // We do this using `_mi_vsnprintf` with a limited buffer.
static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
char buf[512]; char buf[992];
if (fmt==NULL) return; if (fmt==NULL) return;
if (!mi_recurse_enter()) return; if (!mi_recurse_enter()) return;
_mi_vsnprintf(buf, sizeof(buf)-1, fmt, args); _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args);
@ -442,6 +456,13 @@ static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix
} }
} }
void _mi_output_message(const char* fmt, ...) {
va_list args;
va_start(args, fmt);
mi_vfprintf(NULL, NULL, NULL, fmt, args);
va_end(args);
}
void _mi_trace_message(const char* fmt, ...) { void _mi_trace_message(const char* fmt, ...) {
if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher
va_list args; va_list args;

src/os.c

@ -9,21 +9,12 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc/atomic.h" #include "mimalloc/atomic.h"
#include "mimalloc/prim.h" #include "mimalloc/prim.h"
#define mi_os_stat_increase(stat,amount) _mi_stat_increase(&_mi_stats_main.stat, amount) // always use main stats for OS calls
#define mi_os_stat_decrease(stat,amount) _mi_stat_decrease(&_mi_stats_main.stat, amount) #define os_stats (&_mi_stats_main)
#define mi_os_stat_counter_increase(stat,inc) _mi_stat_counter_increase(&_mi_stats_main.stat, inc)
/* ----------------------------------------------------------- /* -----------------------------------------------------------
Initialization. Initialization.
----------------------------------------------------------- */ ----------------------------------------------------------- */
#ifndef MI_DEFAULT_VIRTUAL_ADDRESS_BITS
#if MI_INTPTR_SIZE < 8
#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 32
#else
#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 48
#endif
#endif
#ifndef MI_DEFAULT_PHYSICAL_MEMORY #ifndef MI_DEFAULT_PHYSICAL_MEMORY
#if MI_INTPTR_SIZE < 8 #if MI_INTPTR_SIZE < 8
#define MI_DEFAULT_PHYSICAL_MEMORY 4*MI_GiB #define MI_DEFAULT_PHYSICAL_MEMORY 4*MI_GiB
@ -37,7 +28,7 @@ static mi_os_mem_config_t mi_os_mem_config = {
0, // large page size (usually 2MiB) 0, // large page size (usually 2MiB)
4096, // allocation granularity 4096, // allocation granularity
MI_DEFAULT_PHYSICAL_MEMORY, MI_DEFAULT_PHYSICAL_MEMORY,
MI_DEFAULT_VIRTUAL_ADDRESS_BITS, MI_MAX_VABITS, // in `bits.h`
true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems)
false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span)
true // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory) true // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory)
@ -62,6 +53,18 @@ size_t _mi_os_large_page_size(void) {
return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size()); return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size());
} }
size_t _mi_os_guard_page_size(void) {
const size_t gsize = _mi_os_page_size();
mi_assert(gsize <= (MI_ARENA_SLICE_SIZE/8));
return gsize;
}
size_t _mi_os_virtual_address_bits(void) {
const size_t vbits = mi_os_mem_config.virtual_address_bits;
mi_assert(vbits <= MI_MAX_VABITS);
return vbits;
}
bool _mi_os_use_large_page(size_t size, size_t alignment) { bool _mi_os_use_large_page(size_t size, size_t alignment) {
// if we have access, check the size and alignment requirements // if we have access, check the size and alignment requirements
if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false; if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false;
@ -91,73 +94,54 @@ void _mi_os_init(void) {
bool _mi_os_decommit(void* addr, size_t size); bool _mi_os_decommit(void* addr, size_t size);
bool _mi_os_commit(void* addr, size_t size, bool* is_zero); bool _mi_os_commit(void* addr, size_t size, bool* is_zero);
static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
mi_assert_internal(alignment != 0);
uintptr_t mask = alignment - 1;
if ((alignment & mask) == 0) { // power of two?
return (sz & ~mask);
}
else {
return ((sz / alignment) * alignment);
}
}
static void* mi_align_down_ptr(void* p, size_t alignment) {
return (void*)_mi_align_down((uintptr_t)p, alignment);
}
/* -----------------------------------------------------------
aligned hinting
-------------------------------------------------------------- */
// On systems with enough virtual address bits, we can do efficient aligned allocation by using
// the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address
// space (64TiB) we use this technique. (but see issue #939)
#if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT)
static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
// Return a MI_SEGMENT_SIZE aligned address that is probably available.
// If this returns NULL, the OS will determine the address but on some OS's that may not be
// properly aligned which can be more costly as it needs to be adjusted afterwards.
// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
// in the middle of the 2TiB - 6TiB address range (see issue #372))
#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start
#define MI_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes)
#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size)
{
if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
if (mi_os_mem_config.virtual_address_bits < 46) return NULL; // < 64TiB virtual address space
size = _mi_align_up(size, MI_SEGMENT_SIZE);
if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
#if (MI_SECURE>0)
size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas.
#endif
uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize
uintptr_t init = MI_HINT_BASE;
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode
uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap());
init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB
#endif
uintptr_t expected = hint + size;
mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
}
if (hint%try_alignment != 0) return NULL;
return (void*)hint;
}
#else
void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
MI_UNUSED(try_alignment); MI_UNUSED(size); MI_UNUSED(try_alignment); MI_UNUSED(size);
return NULL; return NULL;
} }
#endif
// In secure mode, return the size of a guard page, otherwise 0
size_t _mi_os_secure_guard_page_size(void) {
#if MI_SECURE > 0
return _mi_os_guard_page_size();
#else
return 0;
#endif
}
// In secure mode, try to decommit an area and output a warning if this fails.
bool _mi_os_secure_guard_page_set_at(void* addr, bool is_pinned) {
if (addr == NULL) return true;
#if MI_SECURE > 0
const bool ok = (is_pinned ? false : _mi_os_decommit(addr, _mi_os_secure_guard_page_size()));
if (!ok) {
_mi_error_message(EINVAL, "secure level %d, but failed to decommit guard page (at %p of size %zu)\n", MI_SECURE, addr, _mi_os_secure_guard_page_size());
}
return ok;
#else
MI_UNUSED(is_pinned);
return true;
#endif
}
// In secure mode, try to decommit an area and output a warning if this fails.
bool _mi_os_secure_guard_page_set_before(void* addr, bool is_pinned) {
return _mi_os_secure_guard_page_set_at((uint8_t*)addr - _mi_os_secure_guard_page_size(), is_pinned);
}
// In secure mode, try to recommit an area
bool _mi_os_secure_guard_page_reset_at(void* addr) {
if (addr == NULL) return true;
#if MI_SECURE > 0
return _mi_os_commit(addr, _mi_os_secure_guard_page_size(), NULL);
#else
return true;
#endif
}
// In secure mode, try to recommit an area
bool _mi_os_secure_guard_page_reset_before(void* addr) {
return _mi_os_secure_guard_page_reset_at((uint8_t*)addr - _mi_os_secure_guard_page_size());
}
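A hedged sketch of how the helpers above are meant to be used at a call site: in secure mode the page directly past an area is decommitted so an overrun faults, and recommitted again before the memory is released (this assumes the underlying reservation extends one guard page past `size`; the actual call sites live in the arena/page code).

#include "mimalloc/internal.h"

static void guarded_area_sketch(uint8_t* area, size_t size, bool is_pinned) {
  _mi_os_secure_guard_page_set_at(area + size, is_pinned);   // decommit the trailing guard page
  // ... use [area, area + size) ...
  _mi_os_secure_guard_page_reset_at(area + size);            // recommit before the area is freed
}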
/* ----------------------------------------------------------- /* -----------------------------------------------------------
Free memory Free memory
@ -186,10 +170,10 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me
void* base = addr; void* base = addr;
// different base? (due to alignment) // different base? (due to alignment)
if (memid.mem.os.base != base) { if (memid.mem.os.base != base) {
mi_assert(memid.mem.os.base <= addr); mi_assert(memid.mem.os.base <= addr);
base = memid.mem.os.base; base = memid.mem.os.base;
const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base; const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base;
if (memid.mem.os.size==0) { if (memid.mem.os.size==0) {
csize += diff; csize += diff;
} }
if (still_committed) { if (still_committed) {
@ -236,8 +220,6 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm
_mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large); _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large);
} }
mi_os_stat_counter_increase(mmap_calls, 1); mi_os_stat_counter_increase(mmap_calls, 1);
if (p != NULL) { if (p != NULL) {
mi_os_stat_increase(reserved, size); mi_os_stat_increase(reserved, size);
@ -270,18 +252,24 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
size = _mi_align_up(size, _mi_os_page_size()); size = _mi_align_up(size, _mi_os_page_size());
// try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) // try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size.
void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero); const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8);
if (p == NULL) return NULL;
void* p = NULL;
if (try_direct_alloc) {
p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
}
// aligned already? // aligned already?
if (((uintptr_t)p % alignment) == 0) { if (p != NULL && ((uintptr_t)p % alignment) == 0) {
*base = p; *base = p;
} }
else { else {
// if not aligned, free it, overallocate, and unmap around it // if not aligned, free it, overallocate, and unmap around it
#if !MI_TRACK_ASAN #if !MI_TRACK_ASAN
_mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); if (try_direct_alloc) {
_mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
}
#endif #endif
if (p != NULL) { mi_os_prim_free(p, size, (commit ? size : 0)); } if (p != NULL) { mi_os_prim_free(p, size, (commit ? size : 0)); }
if (size >= (SIZE_MAX - alignment)) return NULL; // overflow if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
@ -293,10 +281,10 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
if (p == NULL) return NULL; if (p == NULL) return NULL;
// set p to the aligned part in the full region // set p to the aligned part in the full region
// note: this is dangerous on Windows as VirtualFree needs the actual base pointer // note: on Windows VirtualFree needs the actual base pointer
// this is handled though by having the `base` field in the memid's // this is handled by having the `base` field in the memid.
*base = p; // remember the base *base = p; // remember the base
p = mi_align_up_ptr(p, alignment); p = _mi_align_up_ptr(p, alignment);
// explicitly commit only the aligned part // explicitly commit only the aligned part
if (commit) { if (commit) {
@ -309,7 +297,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
if (p == NULL) return NULL; if (p == NULL) return NULL;
// and selectively unmap parts around the over-allocated area. // and selectively unmap parts around the over-allocated area.
void* aligned_p = mi_align_up_ptr(p, alignment); void* aligned_p = _mi_align_up_ptr(p, alignment);
size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
size_t mid_size = _mi_align_up(size, _mi_os_page_size()); size_t mid_size = _mi_align_up(size, _mi_os_page_size());
size_t post_size = over_size - pre_size - mid_size; size_t post_size = over_size - pre_size - mid_size;
@ -339,7 +327,7 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid) {
bool os_is_zero = false; bool os_is_zero = false;
void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero); void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero);
if (p != NULL) { if (p != NULL) {
*memid = _mi_memid_create_os(true, os_is_zero, os_is_large); *memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large);
} }
return p; return p;
} }
@ -355,9 +343,9 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
bool os_is_large = false; bool os_is_large = false;
bool os_is_zero = false; bool os_is_zero = false;
void* os_base = NULL; void* os_base = NULL;
void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base ); void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base);
if (p != NULL) { if (p != NULL) {
*memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large);
memid->mem.os.base = os_base; memid->mem.os.base = os_base;
// memid->mem.os.alignment = alignment; // memid->mem.os.alignment = alignment;
memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned
@ -365,6 +353,18 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
return p; return p;
} }
void* _mi_os_zalloc(size_t size, mi_memid_t* memid) {
void* p = _mi_os_alloc(size, memid);
if (p == NULL) return NULL;
// zero the OS memory if needed
if (!memid->initially_zero) {
_mi_memzero_aligned(p, size);
memid->initially_zero = true;
}
return p;
}
/* ----------------------------------------------------------- /* -----------------------------------------------------------
OS aligned allocation with an offset. This is used OS aligned allocation with an offset. This is used
for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc
@ -374,11 +374,9 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
----------------------------------------------------------- */ ----------------------------------------------------------- */
void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) { void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) {
mi_assert(offset <= MI_SEGMENT_SIZE);
mi_assert(offset <= size); mi_assert(offset <= size);
mi_assert((alignment % _mi_os_page_size()) == 0); mi_assert((alignment % _mi_os_page_size()) == 0);
*memid = _mi_memid_none(); *memid = _mi_memid_none();
if (offset > MI_SEGMENT_SIZE) return NULL;
if (offset == 0) { if (offset == 0) {
// regular aligned allocation // regular aligned allocation
return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid);
@ -411,11 +409,11 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size,
if (newsize != NULL) *newsize = 0; if (newsize != NULL) *newsize = 0;
if (size == 0 || addr == NULL) return NULL; if (size == 0 || addr == NULL) return NULL;
// page align conservatively within the range // page align conservatively within the range, or liberally straddling pages outside the range
void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size()) void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size())
: mi_align_down_ptr(addr, _mi_os_page_size())); : mi_align_down_ptr(addr, _mi_os_page_size()));
void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size()) void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
: mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size())); : _mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start; ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
if (diff <= 0) return NULL; if (diff <= 0) return NULL;
@ -526,7 +524,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size)
return needs_recommit; return needs_recommit;
} }
else { else {
if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory)
_mi_os_reset(p, size); _mi_os_reset(p, size);
} }
return false; // needs no recommit return false; // needs no recommit
@ -591,15 +589,14 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
start = huge_start; start = huge_start;
if (start == 0) { if (start == 0) {
// Initialize the start address after the 32TiB area // Initialize the start address after the 32TiB area
start = ((uintptr_t)32 << 40); // 32TiB virtual start address start = ((uintptr_t)8 << 40); // 8TiB virtual start address
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode
uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap());
start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB
#endif #endif
} }
end = start + size; end = start + size;
mi_assert_internal(end % MI_SEGMENT_SIZE == 0); } while (!mi_atomic_cas_weak_acq_rel(&mi_huge_start, &huge_start, end));
} while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
if (total_size != NULL) *total_size = size; if (total_size != NULL) *total_size = size;
return (uint8_t*)start; return (uint8_t*)start;
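Editor's note: a small worked example of the claimed start address above -- an 8 TiB base plus a random 12-bit multiple of the 1 GiB huge-page size, so the claimed range begins somewhere in [8 TiB, 12 TiB). The `demo_r` value merely stands in for the per-heap random state and is not part of mimalloc:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint64_t base           = (uint64_t)8 << 40;        // 8 TiB virtual start address
  const uint64_t huge_page_size = (uint64_t)1 << 30;        // 1 GiB huge OS page
  const uint64_t demo_r         = 0x9e3779b97f4a7c15ULL;    // stand-in for _mi_heap_random_next
  const uint64_t start = base + huge_page_size * ((demo_r >> 17) & 0x0FFF);
  printf("huge pages claimed from %#llx\n", (unsigned long long)start);  // within [8 TiB, 12 TiB)
  return 0;
}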
@ -612,7 +609,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
} }
#endif #endif
// Allocate MI_SEGMENT_SIZE aligned huge pages // Allocate MI_ARENA_SLICE_ALIGN aligned huge pages
void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) { void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) {
*memid = _mi_memid_none(); *memid = _mi_memid_none();
if (psize != NULL) *psize = 0; if (psize != NULL) *psize = 0;
@ -674,7 +671,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; } if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
if (page != 0) { if (page != 0) {
mi_assert(start != NULL); mi_assert(start != NULL);
*memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */); *memid = _mi_memid_create_os(start, *psize, true /* is committed */, all_zero, true /* is_large */);
memid->memkind = MI_MEM_OS_HUGE; memid->memkind = MI_MEM_OS_HUGE;
mi_assert(memid->is_pinned); mi_assert(memid->is_pinned);
#ifdef MI_TRACK_ASAN #ifdef MI_TRACK_ASAN
@ -727,3 +724,49 @@ int _mi_os_numa_node_get(void) {
if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
return (int)numa_node; return (int)numa_node;
} }
/* ----------------------------------------------------------------------------
Public API
-----------------------------------------------------------------------------*/
#if 0
mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size) {
return mi_os_alloc_aligned(size, mi_os_mem_config.alloc_granularity, commit, NULL, full_size);
}
static void* mi_os_alloc_aligned_ex(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) {
mi_memid_t memid = _mi_memid_none();
void* p = _mi_os_alloc_aligned(size, alignment, commit, allow_large, &memid);
if (p == NULL) return p;
if (is_committed != NULL) { *is_committed = memid.initially_committed; }
if (is_pinned != NULL) { *is_pinned = memid.is_pinned; }
if (base != NULL) { *base = memid.mem.os.base; }
if (full_size != NULL) { *full_size = memid.mem.os.size; }
if (!memid.initially_zero && memid.initially_committed) {
_mi_memzero_aligned(memid.mem.os.base, memid.mem.os.size);
}
return p;
}
mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size) {
return mi_os_alloc_aligned_ex(size, alignment, commit, false, NULL, NULL, base, full_size);
}
mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) {
return mi_os_alloc_aligned_ex(size, alignment, commit, true, is_committed, is_pinned, base, full_size);
}
mi_decl_export void mi_os_free(void* p, size_t size) {
if (p==NULL || size == 0) return;
mi_memid_t memid = _mi_memid_create_os(p, size, true, false, false);
_mi_os_free(p, size, memid);
}
mi_decl_export void mi_os_commit(void* p, size_t size) {
_mi_os_commit(p, size, NULL);
}
mi_decl_export void mi_os_decommit(void* p, size_t size) {
_mi_os_decommit(p, size);
}
#endif

src/page-map.c (new file)

@ -0,0 +1,329 @@
/*----------------------------------------------------------------------------
Copyright (c) 2023-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"
#if MI_PAGE_MAP_FLAT
// The page-map contains a byte for each 64kb slice in the address space.
// For an address `a` where `ofs = _mi_page_map[a >> 16]`:
// 0 = unused
// 1 = the slice at `a & ~0xFFFF` is a mimalloc page.
// 1 < ofs <= 127 = the slice is part of a page, starting at `(((a>>16) - (ofs - 1)) << 16)`.
//
// 1 byte per slice => a 1 TiB address space needs a 2^(40-16) = 16 MiB page map.
// A full 256 TiB address space (48 bit) needs a 4 GiB page map.
// A full 4 GiB address space (32 bit) needs only a 64 KiB page map.
mi_decl_cache_align uint8_t* _mi_page_map = NULL;
static void* mi_page_map_max_address = NULL;
static mi_memid_t mi_page_map_memid;
#define MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT MI_ARENA_SLICE_SIZE
static mi_bitmap_t* mi_page_map_commit; // one bit per committed 64 KiB entries
static void mi_page_map_ensure_committed(size_t idx, size_t slice_count);
bool _mi_page_map_init(void) {
size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS);
if (vbits == 0) {
vbits = _mi_os_virtual_address_bits();
#if MI_ARCH_X64 // canonical address is limited to the first 128 TiB
if (vbits >= 48) { vbits = 47; }
#endif
}
// Allocate the page map and commit bits
mi_page_map_max_address = (void*)(MI_PU(1) << vbits);
const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT));
const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_pagemap_commit)); // _mi_os_has_overcommit(); // commit on-access on Linux systems?
const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT);
const size_t bitmap_size = (commit ? 0 : mi_bitmap_size(commit_bits, NULL));
const size_t reserve_size = bitmap_size + page_map_size;
uint8_t* const base = (uint8_t*)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid);
if (base==NULL) {
_mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB);
return false;
}
if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) {
_mi_warning_message("internal: the page map was committed but not zero initialized!\n");
_mi_memzero_aligned(base, reserve_size);
}
if (bitmap_size > 0) {
mi_page_map_commit = (mi_bitmap_t*)base;
_mi_os_commit(mi_page_map_commit, bitmap_size, NULL);
mi_bitmap_init(mi_page_map_commit, commit_bits, true);
}
_mi_page_map = base + bitmap_size;
// commit the first part so NULL pointers get resolved without an access violation
if (!commit) {
mi_page_map_ensure_committed(0, 1);
}
_mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL
mi_assert_internal(_mi_ptr_page(NULL)==NULL);
return true;
}
static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) {
// is the page map area that contains the page address committed?
// we always set the commit bits so we can track what ranges are in-use.
// we only actually commit if the map wasn't committed fully already.
if (mi_page_map_commit != NULL) {
const size_t commit_idx = idx / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
const size_t commit_idx_hi = (idx + slice_count - 1) / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
for (size_t i = commit_idx; i <= commit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks
if (mi_bitmap_is_clear(mi_page_map_commit, i)) {
// this may race, in which case we do multiple commits (which is ok)
bool is_zero;
uint8_t* const start = _mi_page_map + (i * MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT);
const size_t size = MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
_mi_os_commit(start, size, &is_zero);
if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); }
mi_bitmap_set(mi_page_map_commit, i);
}
}
}
#if MI_DEBUG > 0
_mi_page_map[idx] = 0;
_mi_page_map[idx+slice_count-1] = 0;
#endif
}
static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) {
size_t page_size;
*page_start = mi_page_area(page, &page_size);
if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer
*slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks
return _mi_page_map_index(page);
}
void _mi_page_map_register(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access!
if mi_unlikely(_mi_page_map == NULL) {
if (!_mi_page_map_init()) return;
}
mi_assert(_mi_page_map!=NULL);
uint8_t* page_start;
size_t slice_count;
const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count);
mi_page_map_ensure_committed(idx, slice_count);
// set the offsets
for (size_t i = 0; i < slice_count; i++) {
mi_assert_internal(i < 128);
_mi_page_map[idx + i] = (uint8_t)(i+1);
}
}
void _mi_page_map_unregister(mi_page_t* page) {
mi_assert_internal(_mi_page_map != NULL);
// get index and count
uint8_t* page_start;
size_t slice_count;
const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count);
// unset the offsets
_mi_memzero(_mi_page_map + idx, slice_count);
}
void _mi_page_map_unregister_range(void* start, size_t size) {
const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE);
const uintptr_t index = _mi_page_map_index(start);
mi_page_map_ensure_committed(index, slice_count); // we commit the range in total; todo: scan the commit bits and clear only those ranges?
_mi_memzero(&_mi_page_map[index], slice_count);
}
mi_page_t* _mi_safe_ptr_page(const void* p) {
if mi_unlikely(p >= mi_page_map_max_address) return NULL;
const uintptr_t idx = _mi_page_map_index(p);
if mi_unlikely(mi_page_map_commit != NULL && !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL;
const uintptr_t ofs = _mi_page_map[idx];
if mi_unlikely(ofs == 0) return NULL;
return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) - ofs + 1) << MI_ARENA_SLICE_SHIFT);
}
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return (_mi_safe_ptr_page(p) != NULL);
}
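Editor's note: a self-contained sketch of the flat page-map decoding described at the top of this file -- one byte per 64 KiB slice, where a non-zero value `ofs` means the enclosing page starts `ofs - 1` slices earlier. Names like `demo_map` and `demo_page_of` are illustrative and not mimalloc APIs:

#include <stdint.h>
#include <stdio.h>

#define DEMO_SLICE_SHIFT 16                 // 64 KiB slices

static uint8_t demo_map[1 << 8];            // covers a tiny 16 MiB demo address space

static uintptr_t demo_page_of(uintptr_t a) {
  const uintptr_t idx = a >> DEMO_SLICE_SHIFT;
  const uint8_t ofs = demo_map[idx];
  if (ofs == 0) return 0;                   // not part of any mimalloc page
  // the page starts `ofs - 1` slices before the slice containing `a`
  return ((a >> DEMO_SLICE_SHIFT) - ofs + 1) << DEMO_SLICE_SHIFT;
}

int main(void) {
  // register a 3-slice page starting at slice 4 (address 0x40000)
  for (int i = 0; i < 3; i++) { demo_map[4 + i] = (uint8_t)(i + 1); }
  printf("%#lx\n", (unsigned long)demo_page_of((4u << 16) + 0x1234)); // 0x40000
  printf("%#lx\n", (unsigned long)demo_page_of((6u << 16) + 0xfff0)); // 0x40000
  printf("%#lx\n", (unsigned long)demo_page_of( 7u << 16));           // 0 (unused)
  return 0;
}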
#else
// A 2-level page map
#define MI_PAGE_MAP_SUB_SIZE (MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*))
mi_decl_cache_align mi_page_t*** _mi_page_map;
static void* mi_page_map_max_address;
static mi_memid_t mi_page_map_memid;
static _Atomic(mi_bfield_t) mi_page_map_commit;
static mi_page_t** mi_page_map_ensure_committed(size_t idx);
static mi_page_t** mi_page_map_ensure_at(size_t idx);
static inline void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count);
bool _mi_page_map_init(void) {
size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS);
if (vbits == 0) {
vbits = _mi_os_virtual_address_bits();
#if MI_ARCH_X64 // canonical address is limited to the first 128 TiB
if (vbits >= 48) { vbits = 47; }
#endif
}
// Allocate the page map and commit bits
mi_assert(MI_MAX_VABITS >= vbits);
mi_page_map_max_address = (void*)(MI_PU(1) << vbits);
const size_t page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT));
mi_assert(page_map_count <= MI_PAGE_MAP_COUNT);
const size_t os_page_size = _mi_os_page_size();
const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size);
const size_t reserve_size = page_map_size + os_page_size;
const bool commit = page_map_size <= 64*MI_KiB ||
mi_option_is_enabled(mi_option_pagemap_commit) || _mi_os_has_overcommit();
_mi_page_map = (mi_page_t***)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid);
if (_mi_page_map==NULL) {
_mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB);
return false;
}
if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) {
_mi_warning_message("internal: the page map was committed but not zero initialized!\n");
_mi_memzero_aligned(_mi_page_map, page_map_size);
}
mi_atomic_store_release(&mi_page_map_commit, (commit ? ~MI_ZU(0) : MI_ZU(0)));
// note: for the NULL range we only commit one OS page (in the map and sub)
if (!mi_page_map_memid.initially_committed) {
_mi_os_commit(&_mi_page_map[0], os_page_size, NULL); // commit first part of the map
}
_mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved 2 subs at the end already
if (!mi_page_map_memid.initially_committed) {
_mi_os_commit(_mi_page_map[0], os_page_size, NULL); // only first OS page
}
_mi_page_map[0][0] = (mi_page_t*)&_mi_page_empty; // caught in `mi_free`
mi_assert_internal(_mi_ptr_page(NULL)==&_mi_page_empty);
return true;
}
#define MI_PAGE_MAP_ENTRIES_PER_CBIT (MI_PAGE_MAP_COUNT / MI_BFIELD_BITS)
static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) {
mi_bfield_t commit = mi_atomic_load_relaxed(&mi_page_map_commit);
const size_t bit_idx = idx/MI_PAGE_MAP_ENTRIES_PER_CBIT;
mi_assert_internal(bit_idx < MI_BFIELD_BITS);
if (pbit_idx != NULL) { *pbit_idx = bit_idx; }
return ((commit & (MI_ZU(1) << bit_idx)) != 0);
}
static mi_page_t** mi_page_map_ensure_committed(size_t idx) {
size_t bit_idx;
if mi_unlikely(!mi_page_map_is_committed(idx, &bit_idx)) {
uint8_t* start = (uint8_t*)&_mi_page_map[bit_idx * MI_PAGE_MAP_ENTRIES_PER_CBIT];
_mi_os_commit(start, MI_PAGE_MAP_ENTRIES_PER_CBIT * sizeof(mi_page_t**), NULL);
mi_atomic_or_acq_rel(&mi_page_map_commit, MI_ZU(1) << bit_idx);
}
return _mi_page_map[idx];
}
static mi_page_t** mi_page_map_ensure_at(size_t idx) {
mi_page_t** sub = mi_page_map_ensure_committed(idx);
if mi_unlikely(sub == NULL) {
// sub map not yet allocated, alloc now
mi_memid_t memid;
sub = (mi_page_t**)_mi_os_alloc(MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), &memid);
mi_page_t** expect = NULL;
if (!mi_atomic_cas_strong_acq_rel(((_Atomic(mi_page_t**)*)&_mi_page_map[idx]), &expect, sub)) {
// another thread already allocated it.. free and continue
_mi_os_free(sub, MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), memid);
sub = expect;
mi_assert_internal(sub!=NULL);
}
if (sub == NULL) {
_mi_error_message(EFAULT, "internal error: unable to extend the page map\n");
}
}
return sub;
}
static void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count) {
// is the page map area that contains the page address committed?
while (slice_count > 0) {
mi_page_t** sub = mi_page_map_ensure_at(idx);
// set the offsets for the page
while (sub_idx < MI_PAGE_MAP_SUB_COUNT) {
sub[sub_idx] = page;
slice_count--; if (slice_count == 0) return;
sub_idx++;
}
idx++; // potentially wrap around to the next idx
sub_idx = 0;
}
}
static size_t mi_page_map_get_idx(mi_page_t* page, size_t* sub_idx, size_t* slice_count) {
size_t page_size;
uint8_t* page_start = mi_page_area(page, &page_size);
if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer
*slice_count = mi_slice_count_of_size(page_size) + ((page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks
return _mi_page_map_index(page, sub_idx);
}
void _mi_page_map_register(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access!
if mi_unlikely(_mi_page_map == NULL) {
if (!_mi_page_map_init()) return;
}
mi_assert(_mi_page_map!=NULL);
size_t slice_count;
size_t sub_idx;
const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count);
mi_page_map_set_range(page, idx, sub_idx, slice_count);
}
void _mi_page_map_unregister(mi_page_t* page) {
mi_assert_internal(_mi_page_map != NULL);
// get index and count
size_t slice_count;
size_t sub_idx;
const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count);
// unset the offsets
mi_page_map_set_range(NULL, idx, sub_idx, slice_count);
}
void _mi_page_map_unregister_range(void* start, size_t size) {
const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE);
size_t sub_idx;
const uintptr_t idx = _mi_page_map_index(start, &sub_idx);
mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not already committed?
}
mi_page_t* _mi_safe_ptr_page(const void* p) {
if mi_unlikely(p >= mi_page_map_max_address) return NULL;
size_t sub_idx;
const size_t idx = _mi_page_map_index(p,&sub_idx);
if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL;
mi_page_t** const sub = _mi_page_map[idx];
if mi_unlikely(sub==NULL) return NULL;
return sub[sub_idx];
}
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return (_mi_safe_ptr_page(p) != NULL);
}
#endif
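Editor's note: in the 2-level map above, an address is split into a top-level index (selecting a sub-map pointer) and a sub-index within that sub-map. The sketch below uses assumed shift values for illustration only; the real MI_* constants differ per configuration:

#include <stdint.h>
#include <stdio.h>

#define DEMO_SLICE_SHIFT 16                          // 64 KiB slices (assumed)
#define DEMO_SUB_SHIFT   12                          // 4096 entries per sub-map (assumed)
#define DEMO_SUB_COUNT   ((uint64_t)1 << DEMO_SUB_SHIFT)

static void demo_page_map_index(uint64_t a, uint64_t* idx, uint64_t* sub_idx) {
  *sub_idx = (a >> DEMO_SLICE_SHIFT) & (DEMO_SUB_COUNT - 1);   // entry within the sub-map
  *idx     = a >> (DEMO_SLICE_SHIFT + DEMO_SUB_SHIFT);         // which sub-map
}

int main(void) {
  uint64_t idx, sub_idx;
  demo_page_map_index(0x7f12345678abULL, &idx, &sub_idx);
  printf("idx=%llu sub_idx=%llu\n", (unsigned long long)idx, (unsigned long long)sub_idx);
  return 0;
}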

src/page-queue.c

@ -12,7 +12,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MI_IN_PAGE_C #ifndef MI_IN_PAGE_C
#error "this file should be included from 'page.c'" #error "this file should be included from 'page.c'"
// include to help an IDE // include to help an IDE
#include "mimalloc.h" #include "mimalloc.h"
#include "mimalloc/internal.h" #include "mimalloc/internal.h"
#include "mimalloc/atomic.h" #include "mimalloc/atomic.h"
#endif #endif
@ -38,15 +38,15 @@ terms of the MIT license. A copy of the license can be found in the file
static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) { static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+sizeof(uintptr_t))); return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+sizeof(uintptr_t)));
} }
static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) { static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+(2*sizeof(uintptr_t)))); return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+(2*sizeof(uintptr_t))));
} }
static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
return (pq->block_size > MI_LARGE_OBJ_SIZE_MAX); return (pq->block_size > MI_LARGE_MAX_OBJ_SIZE);
} }
/* ----------------------------------------------------------- /* -----------------------------------------------------------
@ -76,7 +76,7 @@ static inline uint8_t mi_bin(size_t size) {
bin = (uint8_t)wsize; bin = (uint8_t)wsize;
} }
#endif #endif
else if (wsize > MI_LARGE_OBJ_WSIZE_MAX) { else if (wsize > MI_LARGE_MAX_OBJ_WSIZE) {
bin = MI_BIN_HUGE; bin = MI_BIN_HUGE;
} }
else { else {
@ -84,8 +84,9 @@ static inline uint8_t mi_bin(size_t size) {
if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes
#endif #endif
wsize--; wsize--;
// find the highest bit mi_assert_internal(wsize!=0);
uint8_t b = (uint8_t)mi_bsr(wsize); // note: wsize != 0 // find the highest bit position
uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize));
// and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
// - adjust with 3 because we do not round the first 8 sizes // - adjust with 3 because we do not round the first 8 sizes
// which each get an exact bin // which each get an exact bin
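Editor's note: a standalone sketch of the binning rule described above -- sizes beyond the exact small bins are classified by their highest bit plus the next two bits, giving at most ~12.5% internal fragmentation. `demo_bin` is not the mimalloc function and omits the MI_ALIGN4W rounding; it assumes a GCC/Clang builtin for the bit scan:

#include <stddef.h>
#include <stdio.h>

static unsigned demo_bin(size_t wsize) {      // wsize = requested size in machine words
  if (wsize <= 8) return (unsigned)wsize;     // the first 8 word sizes get exact bins
  wsize--;
  const unsigned b = 63u - (unsigned)__builtin_clzll((unsigned long long)wsize);  // highest bit
  // 4 bins per power of two; the two bits below the top bit pick the bin within the group
  return ((b << 2) + (unsigned)((wsize >> (b - 2)) & 0x03)) - 3;
}

int main(void) {
  printf("%u %u %u\n", demo_bin(9), demo_bin(12), demo_bin(17));   // 9 10 13
  return 0;
}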
@ -111,8 +112,8 @@ size_t _mi_bin_size(uint8_t bin) {
} }
// Good size for allocation // Good size for allocation
size_t mi_good_size(size_t size) mi_attr_noexcept { mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept {
if (size <= MI_LARGE_OBJ_SIZE_MAX) { if (size <= MI_LARGE_MAX_OBJ_SIZE) {
return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE));
} }
else { else {
@ -210,8 +211,8 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(page != NULL); mi_assert_internal(page != NULL);
mi_assert_expensive(mi_page_queue_contains(queue, page)); mi_assert_expensive(mi_page_queue_contains(queue, page));
mi_assert_internal(mi_page_block_size(page) == queue->block_size || mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_heap_t* heap = mi_page_heap(page); mi_heap_t* heap = mi_page_heap(page);
if (page->prev != NULL) page->prev->next = page->next; if (page->prev != NULL) page->prev->next = page->next;
@ -226,7 +227,6 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
heap->page_count--; heap->page_count--;
page->next = NULL; page->next = NULL;
page->prev = NULL; page->prev = NULL;
// mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
mi_page_set_in_full(page,false); mi_page_set_in_full(page,false);
} }
@ -242,7 +242,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue)); mi_page_set_in_full(page, mi_page_queue_is_full(queue));
// mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
page->next = queue->first; page->next = queue->first;
page->prev = NULL; page->prev = NULL;
if (queue->first != NULL) { if (queue->first != NULL) {
@ -259,6 +259,34 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
heap->page_count++; heap->page_count++;
} }
static void mi_page_queue_push_at_end(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(!mi_page_queue_contains(queue, page));
mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
page->prev = queue->last;
page->next = NULL;
if (queue->last != NULL) {
mi_assert_internal(queue->last->next == NULL);
queue->last->next = page;
queue->last = page;
}
else {
queue->first = queue->last = page;
}
// update direct
if (queue->first == page) {
mi_heap_queue_first_update(heap, queue);
}
heap->page_count++;
}
static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(mi_page_queue_contains(queue, page)); mi_assert_internal(mi_page_queue_contains(queue, page));
@ -317,8 +345,8 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t*
page->prev = to->first; page->prev = to->first;
page->next = next; page->next = next;
to->first->next = page; to->first->next = page;
if (next != NULL) { if (next != NULL) {
next->prev = page; next->prev = page;
} }
else { else {
to->last = page; to->last = page;
@ -356,13 +384,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
// set append pages to new heap and count // set append pages to new heap and count
size_t count = 0; size_t count = 0;
for (mi_page_t* page = append->first; page != NULL; page = page->next) { for (mi_page_t* page = append->first; page != NULL; page = page->next) {
// inline `mi_page_set_heap` to avoid wrong assertion during absorption; mi_page_set_heap(page, heap);
// in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
// set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
// side effect that it spins until any DELAYED_FREEING is finished. This ensures
// that after appending only the new heap will be used for delayed free operations.
_mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
count++; count++;
} }

src/page.c

@ -36,14 +36,15 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta
return (mi_block_t*)((uint8_t*)page_start + (i * block_size)); return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
} }
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld); //static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld); static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page);
#if (MI_DEBUG>=3) #if (MI_DEBUG>=3)
static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
mi_assert_internal(_mi_ptr_page(page) == page);
size_t count = 0; size_t count = 0;
while (head != NULL) { while (head != NULL) {
mi_assert_internal(page == _mi_ptr_page(head)); mi_assert_internal((uint8_t*)head - (uint8_t*)page > (ptrdiff_t)MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head));
count++; count++;
head = mi_block_next(page, head); head = mi_block_next(page, head);
} }
@ -59,7 +60,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) {
static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
size_t psize; size_t psize;
uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize); uint8_t* page_area = mi_page_area(page, &psize);
mi_block_t* start = (mi_block_t*)page_area; mi_block_t* start = (mi_block_t*)page_area;
mi_block_t* end = (mi_block_t*)(page_area + psize); mi_block_t* end = (mi_block_t*)(page_area + psize);
while(p != NULL) { while(p != NULL) {
@ -83,10 +84,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
mi_assert_internal(page->capacity <= page->reserved); mi_assert_internal(page->capacity <= page->reserved);
// const size_t bsize = mi_page_block_size(page); // const size_t bsize = mi_page_block_size(page);
mi_segment_t* segment = _mi_page_segment(page); // uint8_t* start = mi_page_start(page);
uint8_t* start = mi_page_start(page);
mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE));
//mi_assert_internal(start + page->capacity*page->block_size == page->top); //mi_assert_internal(start + page->capacity*page->block_size == page->top);
mi_assert_internal(mi_page_list_is_valid(page,page->free)); mi_assert_internal(mi_page_list_is_valid(page,page->free));
@ -121,64 +119,25 @@ bool _mi_page_is_valid(mi_page_t* page) {
#if MI_SECURE #if MI_SECURE
mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[0] != 0);
#endif #endif
if (mi_page_heap(page)!=NULL) { if (!mi_page_is_abandoned(page)) {
mi_segment_t* segment = _mi_page_segment(page); //mi_assert_internal(!_mi_process_is_initialized);
mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == mi_page_heap(page)->thread_id || segment->thread_id==0);
#if MI_HUGE_PAGE_ABANDON
if (segment->page_kind != MI_PAGE_HUGE)
#endif
{ {
mi_page_queue_t* pq = mi_page_queue_of(page); mi_page_queue_t* pq = mi_page_queue_of(page);
mi_assert_internal(mi_page_queue_contains(pq, page)); mi_assert_internal(mi_page_queue_contains(pq, page));
mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page)); mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page));
mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); // mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
} }
} }
return true; return true;
} }
#endif #endif
void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
while (!_mi_page_try_use_delayed_free(page, delay, override_never)) {
mi_atomic_yield();
}
}
bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
mi_thread_free_t tfreex;
mi_delayed_t old_delay;
mi_thread_free_t tfree;
size_t yield_count = 0;
do {
tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
tfreex = mi_tf_set_delayed(tfree, delay);
old_delay = mi_tf_delayed(tfree);
if mi_unlikely(old_delay == MI_DELAYED_FREEING) {
if (yield_count >= 4) return false; // give up after 4 tries
yield_count++;
mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
// tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
}
else if (delay == old_delay) {
break; // avoid atomic operation if already equal
}
else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) {
break; // leave never-delayed flag set
}
} while ((old_delay == MI_DELAYED_FREEING) ||
!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
return true; // success
}
/* ----------------------------------------------------------- /* -----------------------------------------------------------
Page collect the `local_free` and `thread_free` lists Page collect the `local_free` and `thread_free` lists
----------------------------------------------------------- */ ----------------------------------------------------------- */
// Collect the local `thread_free` list using an atomic exchange. // Collect the local `thread_free` list using an atomic exchange.
// Note: The exchange must be done atomically as this is used right after
// moving to the full list in `mi_page_collect_ex` and we need to
// ensure that there was no race where the page became unfull just before the move.
static void _mi_page_thread_free_collect(mi_page_t* page) static void _mi_page_thread_free_collect(mi_page_t* page)
{ {
mi_block_t* head; mi_block_t* head;
@ -186,21 +145,21 @@ static void _mi_page_thread_free_collect(mi_page_t* page)
mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
do { do {
head = mi_tf_block(tfree); head = mi_tf_block(tfree);
tfreex = mi_tf_set_block(tfree,NULL); if (head == NULL) return; // return if the list is empty
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); tfreex = mi_tf_create(NULL,mi_tf_is_owned(tfree)); // set the thread free list to NULL
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough?
// return if the list is empty mi_assert_internal(head != NULL);
if (head == NULL) return;
// find the tail -- also to get a proper count (without data races) // find the tail -- also to get a proper count (without data races)
size_t max_count = page->capacity; // cannot collect more than capacity size_t max_count = page->capacity; // cannot collect more than capacity
size_t count = 1; size_t count = 1;
mi_block_t* tail = head; mi_block_t* tail = head;
mi_block_t* next; mi_block_t* next;
while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) { while( (next = mi_block_next(page,tail)) != NULL && count <= max_count) {
count++; count++;
tail = next; tail = next;
} }
// if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free) // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
if (count > max_count) { if (count > max_count) {
_mi_error_message(EFAULT, "corrupted thread-free list\n"); _mi_error_message(EFAULT, "corrupted thread-free list\n");
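Editor's note: the rewritten collect above detaches the whole concurrent free list with a single CAS (and now returns early when it is empty). A hedged C11-atomics sketch of the same detach-then-walk pattern follows; it is illustrative only and omits the ownership bit that mimalloc packs into `xthread_free`:

#include <stdatomic.h>
#include <stddef.h>

typedef struct demo_block_s { struct demo_block_s* next; } demo_block_t;

static _Atomic(demo_block_t*) demo_thread_free;

// other threads: push a freed block onto the shared list
static void demo_thread_push(demo_block_t* b) {
  demo_block_t* head = atomic_load_explicit(&demo_thread_free, memory_order_relaxed);
  do { b->next = head; }
  while (!atomic_compare_exchange_weak_explicit(&demo_thread_free, &head, b,
                                                memory_order_release, memory_order_relaxed));
}

// owning thread: atomically take over the entire list (or return NULL if empty)
static demo_block_t* demo_thread_collect(void) {
  demo_block_t* head = atomic_load_explicit(&demo_thread_free, memory_order_relaxed);
  do { if (head == NULL) return NULL; }
  while (!atomic_compare_exchange_weak_explicit(&demo_thread_free, &head, NULL,
                                                memory_order_acq_rel, memory_order_relaxed));
  return head;   // now private: safe to count and splice into the local free list
}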
@ -219,9 +178,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
mi_assert_internal(page!=NULL); mi_assert_internal(page!=NULL);
// collect the thread free list // collect the thread free list
if (force || mi_page_thread_free(page) != NULL) { // quick test to avoid an atomic operation _mi_page_thread_free_collect(page);
_mi_page_thread_free_collect(page);
}
// and the local free list // and the local free list
if (page->local_free != NULL) { if (page->local_free != NULL) {
@ -254,43 +211,83 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
Page fresh and retire Page fresh and retire
----------------------------------------------------------- */ ----------------------------------------------------------- */
/*
// called from segments when reclaiming abandoned pages // called from segments when reclaiming abandoned pages
void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
// mi_page_set_heap(page, heap);
// _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set)
_mi_page_free_collect(page, false); // ensure used count is up to date
mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_expensive(mi_page_is_valid_init(page));
mi_assert_internal(mi_page_heap(page) == heap); // mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); // mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
#endif
// TODO: push on full queue immediately if it is full? // TODO: push on full queue immediately if it is full?
mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
mi_page_queue_push(heap, pq, page); mi_page_queue_push(heap, pq, page);
mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_expensive(_mi_page_is_valid(page));
} }
*/
// called from `mi_free` on a reclaim, and fresh_alloc if we get an abandoned page
void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page)
{
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_ptr_page(page)==page);
mi_assert_internal(mi_page_is_owned(page));
mi_assert_internal(mi_page_is_abandoned(page));
mi_page_set_heap(page,heap);
_mi_page_free_collect(page, false); // ensure used count is up to date
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
mi_page_queue_push_at_end(heap, pq, page);
mi_assert_expensive(_mi_page_is_valid(page));
}
void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
_mi_page_free_collect(page, false); // ensure used count is up to date
if (mi_page_all_free(page)) {
_mi_page_free(page, pq);
}
else {
mi_page_queue_remove(pq, page);
mi_tld_t* tld = page->heap->tld;
mi_page_set_heap(page, NULL);
_mi_arenas_page_abandon(page);
_mi_arenas_collect(false, false, tld); // allow purging
}
}
// allocate a fresh page from a segment // allocate a fresh page from a segment
static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) { static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) {
#if !MI_HUGE_PAGE_ABANDON #if !MI_HUGE_PAGE_ABANDON
mi_assert_internal(pq != NULL); mi_assert_internal(pq != NULL);
mi_assert_internal(mi_heap_contains_queue(heap, pq)); mi_assert_internal(mi_heap_contains_queue(heap, pq));
mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_OBJ_SIZE_MAX || block_size == pq->block_size); mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size);
#endif #endif
mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments); mi_page_t* page = _mi_arenas_page_alloc(heap, block_size, page_alignment);
if (page == NULL) { if (page == NULL) {
// this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) // out-of-memory
return NULL; return NULL;
} }
#if MI_HUGE_PAGE_ABANDON if (mi_page_is_abandoned(page)) {
mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); _mi_heap_page_reclaim(heap, page);
#endif if (!mi_page_immediate_available(page)) {
mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); if (mi_page_is_expandable(page)) {
// a fresh page was found, initialize it mi_page_extend_free(heap, page);
const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc }
mi_assert_internal(full_block_size >= block_size); else {
mi_page_init(heap, page, full_block_size, heap->tld); mi_assert(false); // should not happen?
return NULL;
}
}
}
else if (pq != NULL) {
mi_page_queue_push(heap, pq, page);
}
mi_heap_stat_increase(heap, pages, 1); mi_heap_stat_increase(heap, pages, 1);
if (pq != NULL) { mi_page_queue_push(heap, pq, page); } mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_expensive(_mi_page_is_valid(page));
return page; return page;
} }
@ -301,55 +298,21 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0);
if (page==NULL) return NULL; if (page==NULL) return NULL;
mi_assert_internal(pq->block_size==mi_page_block_size(page)); mi_assert_internal(pq->block_size==mi_page_block_size(page));
mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page))); mi_assert_internal(pq==mi_heap_page_queue_of(heap, page));
return page; return page;
} }
/* -----------------------------------------------------------
Do any delayed frees
(put there by other threads if they deallocated in a full page)
----------------------------------------------------------- */
void _mi_heap_delayed_free_all(mi_heap_t* heap) {
while (!_mi_heap_delayed_free_partial(heap)) {
mi_atomic_yield();
}
}
// returns true if all delayed frees were processed
bool _mi_heap_delayed_free_partial(mi_heap_t* heap) {
// take over the list (note: no atomic exchange since it is often NULL)
mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
bool all_freed = true;
// and free them all
while(block != NULL) {
mi_block_t* next = mi_block_nextx(heap,block, heap->keys);
// use internal free instead of regular one to keep stats etc correct
if (!_mi_free_delayed_block(block)) {
// we might already start delayed freeing while another thread has not yet
// reset the delayed_freeing flag; in that case delay it further by reinserting the current block
// into the delayed free list
all_freed = false;
mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
do {
mi_block_set_nextx(heap, block, dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
}
block = next;
}
return all_freed;
}
/* ----------------------------------------------------------- /* -----------------------------------------------------------
Unfull, abandon, free and retire Unfull, abandon, free and retire
----------------------------------------------------------- */ ----------------------------------------------------------- */
// Move a page from the full list back to a regular list // Move a page from the full list back to a regular list (called from thread-local mi_free)
void _mi_page_unfull(mi_page_t* page) { void _mi_page_unfull(mi_page_t* page) {
mi_assert_internal(page != NULL); mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(mi_page_is_in_full(page)); mi_assert_internal(mi_page_is_in_full(page));
mi_assert_internal(!mi_page_heap(page)->allow_page_abandon);
if (!mi_page_is_in_full(page)) return; if (!mi_page_is_in_full(page)) return;
mi_heap_t* heap = mi_page_heap(page); mi_heap_t* heap = mi_page_heap(page);
@ -365,85 +328,40 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(!mi_page_immediate_available(page)); mi_assert_internal(!mi_page_immediate_available(page));
mi_assert_internal(!mi_page_is_in_full(page)); mi_assert_internal(!mi_page_is_in_full(page));
if (mi_page_is_in_full(page)) return;
mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
_mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
}
// Abandon a page with used blocks at the end of a thread.
// Note: only call if it is ensured that no references exist from
// the `page->heap->thread_delayed_free` into this page.
// Currently only called through `mi_heap_collect_ex` which ensures this.
void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(mi_page_heap(page) != NULL);
mi_heap_t* pheap = mi_page_heap(page);
// remove from our page list
mi_segments_tld_t* segments_tld = &pheap->tld->segments;
mi_page_queue_remove(pq, page);
// page is no longer associated with our heap
mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
mi_page_set_heap(page, NULL);
#if (MI_DEBUG>1) && !MI_TRACK_ENABLED
// check there are no references left..
for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) {
mi_assert_internal(_mi_ptr_page(block) != page);
}
#endif
// and abandon it
mi_assert_internal(mi_page_heap(page) == NULL);
_mi_segment_page_abandon(page,segments_tld);
}
// force abandon a page
void _mi_page_force_abandon(mi_page_t* page) {
mi_heap_t* heap = mi_page_heap(page); mi_heap_t* heap = mi_page_heap(page);
// mark page as not using delayed free if (heap->allow_page_abandon) {
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); // abandon full pages
// ensure this page is no longer in the heap delayed free list
_mi_heap_delayed_free_all(heap);
// We can still access the page meta-info even if it is freed as we ensure
// in `mi_segment_force_abandon` that the segment is not freed (yet)
if (page->capacity == 0) return; // it may have been freed now
// and now unlink it from the page queue and abandon (or free)
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
if (mi_page_all_free(page)) {
_mi_page_free(page, pq, false);
}
else {
_mi_page_abandon(page, pq); _mi_page_abandon(page, pq);
} }
else {
// put full pages in a heap local queue
if (mi_page_is_in_full(page)) return;
mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
_mi_page_free_collect(page, false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
}
} }
// Free a page with no more free blocks // Free a page with no more free blocks
void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(page != NULL); mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page)); mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(mi_page_all_free(page)); mi_assert_internal(mi_page_all_free(page));
mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING); // mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
// no more aligned blocks in here // no more aligned blocks in here
mi_page_set_has_aligned(page, false); mi_page_set_has_aligned(page, false);
// remove from the page list // remove from the page list
// (no need to do _mi_heap_delayed_free first as all blocks are already free) // (no need to do _mi_heap_delayed_free first as all blocks are already free)
mi_segments_tld_t* segments_tld = &mi_page_heap(page)->tld->segments;
mi_page_queue_remove(pq, page); mi_page_queue_remove(pq, page);
// and free it // and free it
mi_heap_t* heap = page->heap;
mi_page_set_heap(page,NULL); mi_page_set_heap(page,NULL);
_mi_segment_page_free(page, force, segments_tld); _mi_arenas_page_free(page);
_mi_arenas_collect(false, false, heap->tld); // allow purging
} }
#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE
@ -473,9 +391,9 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
const size_t bsize = mi_page_block_size(page); const size_t bsize = mi_page_block_size(page);
if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue?
if (pq->last==page && pq->first==page) { // the only page in the queue? if (pq->last==page && pq->first==page) { // the only page in the queue?
mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
mi_heap_t* heap = mi_page_heap(page); mi_heap_t* heap = mi_page_heap(page);
mi_debug_heap_stat_counter_increase(heap, page_no_retire, 1);
page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
mi_assert_internal(pq >= heap->pages); mi_assert_internal(pq >= heap->pages);
const size_t index = pq - heap->pages; const size_t index = pq - heap->pages;
mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE); mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
@ -486,7 +404,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
} }
} }
#endif #endif
_mi_page_free(page, pq, false); _mi_page_free(page, pq);
} }
// free retired pages: we don't need to look at the entire queues // free retired pages: we don't need to look at the entire queues
@ -501,7 +419,7 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
if (mi_page_all_free(page)) { if (mi_page_all_free(page)) {
page->retire_expire--; page->retire_expire--;
if (force || page->retire_expire == 0) { if (force || page->retire_expire == 0) {
_mi_page_free(pq->first, pq, force); _mi_page_free(pq->first, pq);
} }
else { else {
// keep retired, update min/max // keep retired, update min/max
@ -519,6 +437,36 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
} }
static void mi_heap_collect_full_pages(mi_heap_t* heap) {
// note: normally full pages get immediately abandoned and the full queue is always empty
// this path is only used if abandoning is disabled due to a destroy-able heap or options
// set by the user.
mi_page_queue_t* pq = &heap->pages[MI_BIN_FULL];
for (mi_page_t* page = pq->first; page != NULL; ) {
mi_page_t* next = page->next; // get next in case we free the page
_mi_page_free_collect(page, false); // register concurrent free's
// no longer full?
if (!mi_page_is_full(page)) {
if (mi_page_all_free(page)) {
_mi_page_free(page, pq);
}
else {
_mi_page_unfull(page);
}
}
page = next;
}
}
static mi_decl_noinline void mi_heap_generic_collect(mi_heap_t* heap) {
// call potential deferred free routines
_mi_deferred_free(heap, false);
// collect retired pages
_mi_heap_collect_retired(heap, false);
// collect full pages that had concurrent free's
mi_heap_collect_full_pages(heap);
}
/* ----------------------------------------------------------- /* -----------------------------------------------------------
Initialize the initial free list in a page. Initialize the initial free list in a page.
In secure mode we initialize a randomized list by In secure mode we initialize a randomized list by
@ -531,7 +479,7 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) { static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
MI_UNUSED(stats); MI_UNUSED(stats);
#if (MI_SECURE<=2) #if (MI_SECURE<3)
mi_assert_internal(page->free == NULL); mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL); mi_assert_internal(page->local_free == NULL);
#endif #endif
@ -589,7 +537,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
{ {
MI_UNUSED(stats); MI_UNUSED(stats);
#if (MI_SECURE <= 2) #if (MI_SECURE<3)
mi_assert_internal(page->free == NULL); mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL); mi_assert_internal(page->local_free == NULL);
#endif #endif
@ -617,7 +565,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
----------------------------------------------------------- */ ----------------------------------------------------------- */
#define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well. #define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well.
#if (MI_SECURE>0) #if (MI_SECURE>=3)
#define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many #define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many
#else #else
#define MI_MIN_EXTEND (1) #define MI_MIN_EXTEND (1)
@ -628,9 +576,9 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
// Note: we also experimented with "bump" allocation on the first // Note: we also experimented with "bump" allocation on the first
// allocations but this did not speed up any benchmark (due to an // allocations but this did not speed up any benchmark (due to an
// extra test in malloc? or cache effects?) // extra test in malloc? or cache effects?)
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) { static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) {
mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_expensive(mi_page_is_valid_init(page));
#if (MI_SECURE<=2) #if (MI_SECURE<3)
mi_assert(page->free == NULL); mi_assert(page->free == NULL);
mi_assert(page->local_free == NULL); mi_assert(page->local_free == NULL);
if (page->free != NULL) return; if (page->free != NULL) return;
@ -639,12 +587,12 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
size_t page_size; size_t page_size;
//uint8_t* page_start = //uint8_t* page_start =
_mi_segment_page_start(_mi_page_segment(page), page, &page_size); mi_page_area(page, &page_size);
mi_stat_counter_increase(tld->stats.pages_extended, 1); mi_debug_heap_stat_counter_increase(heap, pages_extended, 1);
// calculate the extend count // calculate the extend count
const size_t bsize = mi_page_block_size(page); const size_t bsize = mi_page_block_size(page);
size_t extend = page->reserved - page->capacity; size_t extend = (size_t)page->reserved - page->capacity;
mi_assert_internal(extend > 0); mi_assert_internal(extend > 0);
size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize); size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize);
@ -660,56 +608,56 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved); mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
mi_assert_internal(extend < (1UL<<16)); mi_assert_internal(extend < (1UL<<16));
// commit on demand?
if (page->slice_committed > 0) {
const size_t needed_size = (page->capacity + extend)*bsize;
const size_t needed_commit = _mi_align_up( mi_page_slice_offset_of(page, needed_size), MI_PAGE_MIN_COMMIT_SIZE );
if (needed_commit > page->slice_committed) {
mi_assert_internal(((needed_commit - page->slice_committed) % _mi_os_page_size()) == 0);
_mi_os_commit(mi_page_slice_start(page) + page->slice_committed, needed_commit - page->slice_committed, NULL);
page->slice_committed = needed_commit;
}
}
// and append the extended free list // and append the extended free list
if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) { if (extend < MI_MIN_SLICES || MI_SECURE<3) { //!mi_option_is_enabled(mi_option_secure)) {
mi_page_free_list_extend(page, bsize, extend, &tld->stats ); mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats );
} }
else { else {
mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats); mi_page_free_list_extend_secure(heap, page, bsize, extend, &heap->tld->stats);
} }
// enable the new free list // enable the new free list
page->capacity += (uint16_t)extend; page->capacity += (uint16_t)extend;
mi_stat_increase(tld->stats.page_committed, extend * bsize); mi_debug_heap_stat_increase(heap, page_committed, extend * bsize);
mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_expensive(mi_page_is_valid_init(page));
} }
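Editor's note: the extend count above is capped so that roughly one OS page worth of blocks (MI_MAX_EXTEND_SIZE) is formatted per call. A stand-alone sketch of that arithmetic, with illustrative names mirroring the constants in the diff:

  #include <stddef.h>

  #define MAX_EXTEND_SIZE  (4*1024)   // mirrors MI_MAX_EXTEND_SIZE
  #define MIN_EXTEND       1          // mirrors MI_MIN_EXTEND in non-secure builds

  // how many new blocks to carve out of the page on this extension
  static size_t extend_count(size_t block_size, size_t capacity, size_t reserved) {
    size_t extend = reserved - capacity;   // blocks still unformatted
    size_t max_extend = (block_size >= MAX_EXTEND_SIZE ? MIN_EXTEND
                                                       : MAX_EXTEND_SIZE / block_size);
    if (max_extend < MIN_EXTEND) { max_extend = MIN_EXTEND; }
    return (extend > max_extend ? max_extend : extend);
  }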
// Initialize a fresh page // Initialize a fresh page (that is already partially initialized)
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) { void _mi_page_init(mi_heap_t* heap, mi_page_t* page) {
mi_assert(page != NULL); mi_assert(page != NULL);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert(segment != NULL);
mi_assert_internal(block_size > 0);
// set fields
mi_page_set_heap(page, heap); mi_page_set_heap(page, heap);
page->block_size = block_size;
size_t page_size; size_t page_size;
page->page_start = _mi_segment_page_start(segment, page, &page_size); uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start);
mi_track_mem_noaccess(page->page_start,page_size); mi_track_mem_noaccess(page_start,page_size);
mi_assert_internal(page_size / block_size < (1L<<16)); mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16));
page->reserved = (uint16_t)(page_size / block_size);
mi_assert_internal(page->reserved > 0); mi_assert_internal(page->reserved > 0);
#if (MI_PADDING || MI_ENCODE_FREELIST) #if (MI_PADDING || MI_ENCODE_FREELIST)
page->keys[0] = _mi_heap_random_next(heap); page->keys[0] = _mi_heap_random_next(heap);
page->keys[1] = _mi_heap_random_next(heap); page->keys[1] = _mi_heap_random_next(heap);
#endif #endif
page->free_is_zero = page->is_zero_init;
#if MI_DEBUG>2 #if MI_DEBUG>2
if (page->is_zero_init) { if (page->memid.initially_zero) {
mi_track_mem_defined(page->page_start, page_size); mi_track_mem_defined(page->page_start, mi_page_committed(page));
mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size)); mi_assert_expensive(mi_mem_is_zero(page_start, mi_page_committed(page)));
} }
#endif #endif
if (block_size > 0 && _mi_is_power_of_two(block_size)) {
page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size));
}
else {
page->block_size_shift = 0;
}
mi_assert_internal(page->capacity == 0); mi_assert_internal(page->capacity == 0);
mi_assert_internal(page->free == NULL); mi_assert_internal(page->free == NULL);
mi_assert_internal(page->used == 0); mi_assert_internal(page->used == 0);
mi_assert_internal(page->xthread_free == 0); mi_assert_internal(mi_page_is_owned(page));
mi_assert_internal(page->xthread_free == 1);
mi_assert_internal(page->next == NULL); mi_assert_internal(page->next == NULL);
mi_assert_internal(page->prev == NULL); mi_assert_internal(page->prev == NULL);
mi_assert_internal(page->retire_expire == 0); mi_assert_internal(page->retire_expire == 0);
@ -718,11 +666,11 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[0] != 0);
mi_assert_internal(page->keys[1] != 0); mi_assert_internal(page->keys[1] != 0);
#endif #endif
mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift))); mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift)));
mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_expensive(mi_page_is_valid_init(page));
// initialize an initial free list // initialize an initial free list
mi_page_extend_free(heap,page,tld); mi_page_extend_free(heap,page);
mi_assert(mi_page_immediate_available(page)); mi_assert(mi_page_immediate_available(page));
} }
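Editor's note: for power-of-two block sizes the `block_size_shift` field checked above lets block indices be computed with a shift instead of a division. A hedged sketch of that use (not mimalloc's actual helper):

  #include <stddef.h>
  #include <stdint.h>

  // illustrative only: index of a block within a page when block_size is a power of two
  static size_t block_index(const uint8_t* page_start, const void* block, uint8_t block_size_shift) {
    const size_t ofs = (size_t)((const uint8_t*)block - page_start);
    return ofs >> block_size_shift;   // equivalent to ofs / block_size, but cheaper
  }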
@ -731,40 +679,29 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
Find pages with free blocks Find pages with free blocks
-------------------------------------------------------------*/ -------------------------------------------------------------*/
// search for a best next page to use for at most N pages (often cut short if immediate blocks are available)
#define MI_MAX_CANDIDATE_SEARCH (4)
// is the page not yet used up to its reserved space?
static bool mi_page_is_expandable(const mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(page->capacity <= page->reserved);
return (page->capacity < page->reserved);
}
// Find a page with free blocks of `page->block_size`. // Find a page with free blocks of `page->block_size`.
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
{ {
// search through the pages in "next fit" order // search through the pages in "next fit" order
#if MI_STAT #if MI_STAT
size_t count = 0; size_t count = 0;
#endif #endif
size_t candidate_count = 0; // we reset this on the first candidate to limit the search long candidate_limit = 0; // we reset this on the first candidate to limit the search
long full_page_retain = heap->full_page_retain;
mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page_candidate = NULL; // a page with free space
mi_page_t* page = pq->first; mi_page_t* page = pq->first;
while (page != NULL) while (page != NULL)
{ {
mi_page_t* next = page->next; // remember next mi_page_t* next = page->next; // remember next (as this page can move to another queue)
#if MI_STAT #if MI_STAT
count++; count++;
#endif #endif
candidate_count++; candidate_limit--;
// collect freed blocks by us and other threads // collect freed blocks by us and other threads
_mi_page_free_collect(page, false); _mi_page_free_collect(page, false);
#if MI_MAX_CANDIDATE_SEARCH > 1
// search up to N pages for a best candidate // search up to N pages for a best candidate
// is the local free list non-empty? // is the local free list non-empty?
@ -773,28 +710,36 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
// if the page is completely full, move it to the `mi_pages_full` // if the page is completely full, move it to the `mi_pages_full`
// queue so we don't visit long-lived pages too often. // queue so we don't visit long-lived pages too often.
if (!immediate_available && !mi_page_is_expandable(page)) { if (!immediate_available && !mi_page_is_expandable(page)) {
mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); full_page_retain--;
mi_page_to_full(page, pq); if (full_page_retain < 0) {
mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
mi_page_to_full(page, pq);
}
} }
else { else {
// the page has free space, make it a candidate // the page has free space, make it a candidate
// we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages) // we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages)
if (page_candidate == NULL) { if (page_candidate == NULL) {
page_candidate = page; page_candidate = page;
candidate_count = 0; candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates);
}
else if (mi_page_all_free(page_candidate)) {
_mi_page_free(page_candidate, pq);
page_candidate = page;
} }
// prefer to reuse fuller pages (in the hope the less used page gets freed) // prefer to reuse fuller pages (in the hope the less used page gets freed)
else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page) && !mi_page_is_expandable(page)) { else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { // && !mi_page_is_expandable(page)) {
page_candidate = page; page_candidate = page;
} }
// if we find a non-expandable candidate, or searched for N pages, return with the best candidate // if we find a non-expandable candidate, or searched for N pages, return with the best candidate
if (immediate_available || candidate_count > MI_MAX_CANDIDATE_SEARCH) { if (immediate_available || candidate_limit <= 0) {
mi_assert_internal(page_candidate!=NULL); mi_assert_internal(page_candidate!=NULL);
break; break;
} }
} }
#else
// first-fit algorithm #if 0
// first-fit algorithm without candidates
// If the page contains free blocks, we are done // If the page contains free blocks, we are done
if (mi_page_immediate_available(page) || mi_page_is_expandable(page)) { if (mi_page_immediate_available(page) || mi_page_is_expandable(page)) {
break; // pick this one break; // pick this one
@ -809,26 +754,32 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
page = next; page = next;
} // for each page } // for each page
mi_heap_stat_counter_increase(heap, searches, count); mi_debug_heap_stat_counter_increase(heap, searches, count);
// set the page to the best candidate // set the page to the best candidate
if (page_candidate != NULL) { if (page_candidate != NULL) {
page = page_candidate; page = page_candidate;
} }
if (page != NULL && !mi_page_immediate_available(page)) { if (page != NULL) {
mi_assert_internal(mi_page_is_expandable(page)); if (!mi_page_immediate_available(page)) {
mi_page_extend_free(heap, page, heap->tld); mi_assert_internal(mi_page_is_expandable(page));
mi_page_extend_free(heap, page);
}
mi_assert_internal(mi_page_immediate_available(page));
} }
if (page == NULL) { if (page == NULL) {
_mi_heap_collect_retired(heap, false); // perhaps make a page available _mi_heap_collect_retired(heap, false); // perhaps make a page available
page = mi_page_fresh(heap, pq); page = mi_page_fresh(heap, pq);
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
if (page == NULL && first_try) { if (page == NULL && first_try) {
// out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
page = mi_page_queue_find_free_ex(heap, pq, false); page = mi_page_queue_find_free_ex(heap, pq, false);
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
} }
} }
else { else {
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
// move the page to the front of the queue // move the page to the front of the queue
mi_page_queue_move_to_front(heap, pq, page); mi_page_queue_move_to_front(heap, pq, page);
page->retire_expire = 0; page->retire_expire = 0;
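Editor's note: the loop above is a bounded "best of the next N" scan rather than plain first-fit; it also stops early when a page has an immediately usable free list and resets the limit at the first candidate. Stripped of mimalloc specifics, the shape of the search is roughly as follows (types and helpers are illustrative, not mimalloc's):

  #include <stdbool.h>
  #include <stddef.h>

  typedef struct page_s {
    struct page_s* next;
    size_t used;       // blocks currently allocated
    size_t reserved;   // total blocks the page can hold
  } page_t;

  static bool page_has_free(const page_t* p)       { return p->used < p->reserved; }
  static bool page_is_mostly_used(const page_t* p) { return p->used + 1 >= p->reserved; }

  // scan at most `limit` pages and keep the fullest page that still has room
  static page_t* pick_candidate(page_t* first, long limit) {
    page_t* best = NULL;
    for (page_t* p = first; p != NULL; p = p->next) {
      if (page_has_free(p)) {
        if (best == NULL) { best = p; }
        else if (p->used >= best->used && !page_is_mostly_used(p)) { best = p; }
      }
      if (--limit <= 0) break;   // cap the search length
    }
    return best;
  }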
@ -843,15 +794,16 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
// Find a page with free blocks of `size`. // Find a page with free blocks of `size`.
static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) {
mi_page_queue_t* pq = mi_page_queue(heap, size); // mi_page_queue_t* pq = mi_page_queue(heap, size);
mi_assert_internal(!mi_page_queue_is_huge(pq));
// check the first page: we do this even with candidate search, since otherwise we would re-search every time // check the first page: we do this even with candidate search, since otherwise we would re-search every time
mi_page_t* page = pq->first; mi_page_t* page = pq->first;
if (page != NULL) { if (page != NULL) {
#if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
mi_page_extend_free(heap, page, heap->tld); mi_page_extend_free(heap, page);
mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_immediate_available(page));
} }
else else
@ -902,13 +854,13 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex
// Huge pages contain just one block, and the segment contains just that page. // Huge pages contain just one block, and the segment contains just that page.
// Huge pages are also used if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) // Huge pages are also used if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX)
// so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. // so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`.
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment, mi_page_queue_t* pq) {
size_t block_size = _mi_os_good_alloc_size(size); const size_t block_size = _mi_os_good_alloc_size(size);
mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); // mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0);
#if MI_HUGE_PAGE_ABANDON #if MI_HUGE_PAGE_ABANDON
mi_page_queue_t* pq = NULL; #error todo.
#else #else
mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size // mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size
mi_assert_internal(mi_page_queue_is_huge(pq)); mi_assert_internal(mi_page_queue_is_huge(pq));
#endif #endif
mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment);
@ -916,10 +868,9 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a
mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(mi_page_is_huge(page)); mi_assert_internal(mi_page_is_huge(page));
mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE); mi_assert_internal(mi_page_is_singleton(page));
mi_assert_internal(_mi_page_segment(page)->used==1);
#if MI_HUGE_PAGE_ABANDON #if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue mi_assert_internal(mi_page_is_abandoned(page));
mi_page_set_heap(page, NULL); mi_page_set_heap(page, NULL);
#endif #endif
mi_heap_stat_increase(heap, huge, mi_page_block_size(page)); mi_heap_stat_increase(heap, huge, mi_page_block_size(page));
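Editor's note: such singleton huge pages are normally reached through the aligned-allocation entry points. A usage sketch follows; the exact alignment threshold at which mimalloc switches to a dedicated page depends on the build, so treat the 4 MiB value as illustrative.

  #include <mimalloc.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    // a very large alignment forces a dedicated (singleton) page for the block
    void* p = mi_malloc_aligned(1024, 4*1024*1024);   // 4 MiB alignment
    printf("p = %p (aligned: %d)\n", p, (int)(((uintptr_t)p % (4*1024*1024)) == 0));
    mi_free(p);
    return 0;
  }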
@ -932,30 +883,30 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a
// Allocate a page // Allocate a page
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept {
// huge allocation?
const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) {
if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL;
return NULL; }
} mi_page_queue_t* pq = mi_page_queue(heap, (huge_alignment > 0 ? MI_LARGE_MAX_OBJ_SIZE+1 : size));
else { // huge allocation?
return mi_huge_page_alloc(heap,size,huge_alignment); if mi_unlikely(mi_page_queue_is_huge(pq) || req_size > MI_MAX_ALLOC_SIZE) {
} return mi_huge_page_alloc(heap,size,huge_alignment,pq);
} }
else { else {
// otherwise find a page with free blocks in our size segregated queues // otherwise find a page with free blocks in our size segregated queues
#if MI_PADDING #if MI_PADDING
mi_assert_internal(size >= MI_PADDING_SIZE); mi_assert_internal(size >= MI_PADDING_SIZE);
#endif #endif
return mi_find_free_page(heap, size); return mi_find_free_page(heap, pq);
} }
} }
// Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for // The `huge_alignment` is normally 0 but is set to a multiple of MI_SLICE_SIZE for
// very large requested alignments in which case we use a huge segment. // very large requested alignments in which case we use a huge singleton page.
void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept
{ {
mi_assert_internal(heap != NULL); mi_assert_internal(heap != NULL);
@ -967,15 +918,16 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
} }
mi_assert_internal(mi_heap_is_initialized(heap)); mi_assert_internal(mi_heap_is_initialized(heap));
// call potential deferred free routines // collect every N generic mallocs
_mi_deferred_free(heap, false); if mi_unlikely(heap->generic_count++ > 10000) {
heap->generic_count = 0;
// free delayed frees from other threads (but skip contended ones) mi_heap_generic_collect(heap);
_mi_heap_delayed_free_partial(heap); }
// find (or allocate) a page of the right size // find (or allocate) a page of the right size
mi_page_t* page = mi_find_page(heap, size, huge_alignment); mi_page_t* page = mi_find_page(heap, size, huge_alignment);
if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more
mi_heap_generic_collect(heap);
mi_heap_collect(heap, true /* force */); mi_heap_collect(heap, true /* force */);
page = mi_find_page(heap, size, huge_alignment); page = mi_find_page(heap, size, huge_alignment);
} }
@ -988,6 +940,8 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_ptr_page(page)==page);
// and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc) // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc)
if mi_unlikely(zero && mi_page_is_huge(page)) { if mi_unlikely(zero && mi_page_is_huge(page)) {

View file

@ -239,6 +239,9 @@ void _mi_prim_thread_done_auto_done(void) {
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
MI_UNUSED(heap); MI_UNUSED(heap);
} }
#endif #endif
bool _mi_prim_thread_is_in_threadpool(void) {
return false;
}

View file

@ -62,6 +62,7 @@ terms of the MIT license. A copy of the license can be found in the file
#include <sys/syscall.h> #include <sys/syscall.h>
#endif #endif
#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this?
//------------------------------------------------------------------------------------ //------------------------------------------------------------------------------------
// Use syscalls for some primitives to allow for libraries that override open/read/close etc. // Use syscalls for some primitives to allow for libraries that override open/read/close etc.
@ -147,7 +148,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
} }
#endif #endif
} }
config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE;
config->has_overcommit = unix_detect_overcommit(); config->has_overcommit = unix_detect_overcommit();
config->has_partial_free = true; // mmap can free in parts config->has_partial_free = true; // mmap can free in parts
config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE)
@ -362,6 +363,9 @@ int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool comm
mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
mi_assert_internal(commit || !allow_large); mi_assert_internal(commit || !allow_large);
mi_assert_internal(try_alignment > 0); mi_assert_internal(try_alignment > 0);
if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) {
try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations
}
*is_zero = true; *is_zero = true;
int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
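Editor's note on the earlier TODO about querying the OS for the large page size instead of hard-coding MI_UNIX_LARGE_PAGE_SIZE: on Linux one possible approach, not used by this change, is to parse `Hugepagesize` from `/proc/meminfo`. A hedged sketch:

  #include <stdio.h>
  #include <stddef.h>

  // returns the default huge page size in bytes, or 0 if it cannot be determined
  static size_t linux_default_hugepage_size(void) {
    FILE* f = fopen("/proc/meminfo", "r");
    if (f == NULL) return 0;
    char line[128];
    size_t kib = 0;
    while (fgets(line, sizeof(line), f) != NULL) {
      if (sscanf(line, "Hugepagesize: %zu kB", &kib) == 1) break;
    }
    fclose(f);
    return kib * 1024;
  }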
@ -409,7 +413,7 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
int err = 0; int err = 0;
// decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
err = unix_madvise(start, size, MADV_DONTNEED); err = unix_madvise(start, size, MADV_DONTNEED);
#if !MI_DEBUG && !MI_SECURE #if !MI_DEBUG && MI_SECURE<=2
*needs_recommit = false; *needs_recommit = false;
#else #else
*needs_recommit = true; *needs_recommit = true;
@ -479,7 +483,7 @@ static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, co
int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
bool is_large = true; bool is_large = true;
*is_zero = true; *is_zero = true;
*addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); *addr = unix_mmap(hint_addr, size, MI_ARENA_SLICE_ALIGN, PROT_READ | PROT_WRITE, true, true, &is_large);
if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
unsigned long numa_mask = (1UL << numa_node); unsigned long numa_mask = (1UL << numa_node);
// TODO: does `mbind` work correctly for huge OS pages? should we // TODO: does `mbind` work correctly for huge OS pages? should we
@ -886,3 +890,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
} }
#endif #endif
bool _mi_prim_thread_is_in_threadpool(void) {
return false;
}

View file

@ -277,3 +277,7 @@ void _mi_prim_thread_done_auto_done(void) {
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
MI_UNUSED(heap); MI_UNUSED(heap);
} }
bool _mi_prim_thread_is_in_threadpool(void) {
return false;
}

View file

@ -17,6 +17,14 @@ terms of the MIT license. A copy of the license can be found in the file
// Dynamically bind Windows API points for portability // Dynamically bind Windows API points for portability
//--------------------------------------------- //---------------------------------------------
#if defined(_MSC_VER)
#pragma warning(disable:28159) // don't use GetVersion
#pragma warning(disable:4996) // don't use GetVersion
#endif
static DWORD win_major_version = 6;
static DWORD win_minor_version = 0;
// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. // We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) // So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) // NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB)
@ -108,16 +116,25 @@ static bool win_enable_large_os_pages(size_t* large_page_size)
// Initialize // Initialize
//--------------------------------------------- //---------------------------------------------
static DWORD win_allocation_granularity = 64*MI_KiB;
void _mi_prim_mem_init( mi_os_mem_config_t* config ) void _mi_prim_mem_init( mi_os_mem_config_t* config )
{ {
config->has_overcommit = false; config->has_overcommit = false;
config->has_partial_free = false; config->has_partial_free = false;
config->has_virtual_reserve = true; config->has_virtual_reserve = true;
// windows version
const DWORD win_version = GetVersion();
win_major_version = (DWORD)(LOBYTE(LOWORD(win_version)));
win_minor_version = (DWORD)(HIBYTE(LOWORD(win_version)));
// get the page size // get the page size
SYSTEM_INFO si; SYSTEM_INFO si;
GetSystemInfo(&si); GetSystemInfo(&si);
if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; } if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; }
if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; } if (si.dwAllocationGranularity > 0) {
config->alloc_granularity = si.dwAllocationGranularity;
win_allocation_granularity = si.dwAllocationGranularity;
}
// get virtual address bits // get virtual address bits
if ((uintptr_t)si.lpMaximumApplicationAddress > 0) { if ((uintptr_t)si.lpMaximumApplicationAddress > 0) {
const size_t vbits = MI_INTPTR_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress); const size_t vbits = MI_INTPTR_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress);
@ -127,7 +144,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
ULONGLONG memInKiB = 0; ULONGLONG memInKiB = 0;
if (GetPhysicallyInstalledSystemMemory(&memInKiB)) { if (GetPhysicallyInstalledSystemMemory(&memInKiB)) {
if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) { if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) {
config->physical_memory = memInKiB * MI_KiB; config->physical_memory = (size_t)(memInKiB * MI_KiB);
} }
} }
// get the VirtualAlloc2 function // get the VirtualAlloc2 function
@ -175,7 +192,7 @@ int _mi_prim_free(void* addr, size_t size ) {
// the start of the region. // the start of the region.
MEMORY_BASIC_INFORMATION info = { 0 }; MEMORY_BASIC_INFORMATION info = { 0 };
VirtualQuery(addr, &info, sizeof(info)); VirtualQuery(addr, &info, sizeof(info));
if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) { if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)(4*MI_MiB)) {
errcode = 0; errcode = 0;
err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0); err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
if (err) { errcode = GetLastError(); } if (err) { errcode = GetLastError(); }
@ -203,7 +220,7 @@ static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_ali
} }
#endif #endif
// on modern Windows try to use VirtualAlloc2 for aligned allocation // on modern Windows try to use VirtualAlloc2 for aligned allocation
if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { if (addr == NULL && try_alignment > win_allocation_granularity && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
reqs.Alignment = try_alignment; reqs.Alignment = try_alignment;
MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
@ -239,7 +256,7 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen
// success, return the address // success, return the address
return p; return p;
} }
else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) && else if (max_retry_msecs > 0 && (try_alignment <= 8*MI_MiB) &&
(flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 && (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 &&
win_is_out_of_memory_error(GetLastError())) { win_is_out_of_memory_error(GetLastError())) {
// if committing regular memory and being out-of-memory, // if committing regular memory and being out-of-memory,
@ -815,3 +832,16 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
mi_allocator_done(); mi_allocator_done();
} }
#endif #endif
bool _mi_prim_thread_is_in_threadpool(void) {
#if (MI_ARCH_X64 || MI_ARCH_X86)
if (win_major_version >= 6) {
// check if this thread belongs to a windows threadpool
// see: <https://www.geoffchappell.com/studies/windows/km/ntoskrnl/inc/api/pebteb/teb/index.htm>
_TEB* const teb = NtCurrentTeb();
void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778)));
return (pool_data != NULL);
}
#endif
return false;
}

View file

@ -7,7 +7,6 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc.h" #include "mimalloc.h"
#include "mimalloc/internal.h" #include "mimalloc/internal.h"
#include "mimalloc/prim.h" // _mi_prim_random_buf #include "mimalloc/prim.h" // _mi_prim_random_buf
#include <string.h> // memset
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
We use our own PRNG to keep predictable performance of random number generation We use our own PRNG to keep predictable performance of random number generation
@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil
(gcc x64 has no register spills, and clang 6+ uses SSE instructions) (gcc x64 has no register spills, and clang 6+ uses SSE instructions)
-----------------------------------------------------------------------------*/ -----------------------------------------------------------------------------*/
static inline uint32_t rotl(uint32_t x, uint32_t shift) {
return (x << shift) | (x >> (32 - shift));
}
static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) { static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16); x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16);
x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12); x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12);
x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8); x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8);
x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7); x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7);
} }
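Editor's note: the change swaps the local `rotl` helper for a shared `mi_rotl32`. For readers following along, a plain 32-bit left-rotate equivalent to what the quarter round needs looks like this (a sketch, not the library's definition):

  #include <stdint.h>

  static inline uint32_t rotl32(uint32_t x, uint32_t shift) {
    shift &= 31;                                     // keep the shift in range
    return (x << shift) | (x >> ((32 - shift) & 31));
  }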
static void chacha_block(mi_random_ctx_t* ctx) static void chacha_block(mi_random_ctx_t* ctx)
@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
// since we only use chacha for randomness (and not encryption) we // since we only use chacha for randomness (and not encryption) we
// do not _need_ to read 32-bit values as little endian but we do anyways // do not _need_ to read 32-bit values as little endian but we do anyways
// just for being compatible :-) // just for being compatible :-)
memset(ctx, 0, sizeof(*ctx)); _mi_memzero(ctx, sizeof(*ctx));
for (size_t i = 0; i < 4; i++) { for (size_t i = 0; i < 4; i++) {
const uint8_t* sigma = (uint8_t*)"expand 32-byte k"; const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
ctx->input[i] = read32(sigma,i); ctx->input[i] = read32(sigma,i);
@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
} }
static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) { static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
memset(ctx_new, 0, sizeof(*ctx_new)); _mi_memzero(ctx_new, sizeof(*ctx_new));
_mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input)); _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
ctx_new->input[12] = 0; ctx_new->input[12] = 0;
ctx_new->input[13] = 0; ctx_new->input[13] = 0;
@ -160,7 +155,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim
uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
x ^= _mi_prim_clock_now(); x ^= _mi_prim_clock_now();
// and do a few randomization steps // and do a few randomization steps
uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1; uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
for (uintptr_t i = 0; i < max; i++) { for (uintptr_t i = 0; i < max; i++) {

View file

@ -1,136 +0,0 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* -----------------------------------------------------------
The following functions are to reliably find the segment or
block that encompasses any pointer p (or NULL if it is not
in any of our segments).
We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB)
set to 1 if it contains the segment meta data.
----------------------------------------------------------- */
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"
// Reduce total address space to reduce .bss (due to the `mi_segment_map`)
#if (MI_INTPTR_SIZE > 4) && MI_TRACK_ASAN
#define MI_SEGMENT_MAP_MAX_ADDRESS (128*1024ULL*MI_GiB) // 128 TiB (see issue #881)
#elif (MI_INTPTR_SIZE > 4)
#define MI_SEGMENT_MAP_MAX_ADDRESS (48*1024ULL*MI_GiB) // 48 TiB
#else
#define MI_SEGMENT_MAP_MAX_ADDRESS (UINT32_MAX)
#endif
#define MI_SEGMENT_MAP_PART_SIZE (MI_INTPTR_SIZE*MI_KiB - 128) // 128 > sizeof(mi_memid_t) !
#define MI_SEGMENT_MAP_PART_BITS (8*MI_SEGMENT_MAP_PART_SIZE)
#define MI_SEGMENT_MAP_PART_ENTRIES (MI_SEGMENT_MAP_PART_SIZE / MI_INTPTR_SIZE)
#define MI_SEGMENT_MAP_PART_BIT_SPAN (MI_SEGMENT_ALIGN)
#define MI_SEGMENT_MAP_PART_SPAN (MI_SEGMENT_MAP_PART_BITS * MI_SEGMENT_MAP_PART_BIT_SPAN)
#define MI_SEGMENT_MAP_MAX_PARTS ((MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_SPAN) + 1)
// A part of the segment map.
typedef struct mi_segmap_part_s {
mi_memid_t memid;
_Atomic(uintptr_t) map[MI_SEGMENT_MAP_PART_ENTRIES];
} mi_segmap_part_t;
// Allocate parts on-demand to reduce .bss footprint
static _Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. }
static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) {
// note: segment can be invalid or NULL.
mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE?
*idx = 0;
*bitidx = 0;
if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL;
const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN;
if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL;
mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[segindex]);
// allocate on demand to reduce .bss footprint
if (part == NULL) {
if (!create_on_demand) return NULL;
mi_memid_t memid;
part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid);
if (part == NULL) return NULL;
part->memid = memid;
mi_segmap_part_t* expected = NULL;
if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) {
_mi_os_free(part, sizeof(mi_segmap_part_t), memid);
part = expected;
if (part == NULL) return NULL;
}
}
mi_assert(part != NULL);
const uintptr_t offset = ((uintptr_t)segment) % MI_SEGMENT_MAP_PART_SPAN;
const uintptr_t bitofs = offset / MI_SEGMENT_MAP_PART_BIT_SPAN;
*idx = bitofs / MI_INTPTR_BITS;
*bitidx = bitofs % MI_INTPTR_BITS;
return part;
}
void _mi_segment_map_allocated_at(const mi_segment_t* segment) {
if (segment->memid.memkind == MI_MEM_ARENA) return; // we lookup segments first in the arena's and don't need the segment map
size_t index;
size_t bitidx;
mi_segmap_part_t* part = mi_segment_map_index_of(segment, true /* alloc map if needed */, &index, &bitidx);
if (part == NULL) return; // outside our address range..
uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]);
uintptr_t newmask;
do {
newmask = (mask | ((uintptr_t)1 << bitidx));
} while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask));
}
void _mi_segment_map_freed_at(const mi_segment_t* segment) {
if (segment->memid.memkind == MI_MEM_ARENA) return;
size_t index;
size_t bitidx;
mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* don't alloc if not present */, &index, &bitidx);
if (part == NULL) return; // outside our address range..
uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]);
uintptr_t newmask;
do {
newmask = (mask & ~((uintptr_t)1 << bitidx));
} while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask));
}
// Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
static mi_segment_t* _mi_segment_of(const void* p) {
if (p == NULL) return NULL;
mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL
size_t index;
size_t bitidx;
mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx);
if (part == NULL) return NULL;
const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]);
if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) {
bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok);
return segment; // yes, allocated by us
}
return NULL;
}
// Is this a valid pointer in our heap?
static bool mi_is_valid_pointer(const void* p) {
// first check if it is in an arena, then check if it is OS allocated
return (_mi_arena_contains(p) || _mi_segment_of(p) != NULL);
}
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return mi_is_valid_pointer(p);
}
void _mi_segment_map_unsafe_destroy(void) {
for (size_t i = 0; i < MI_SEGMENT_MAP_MAX_PARTS; i++) {
mi_segmap_part_t* part = mi_atomic_exchange_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[i], NULL);
if (part != NULL) {
_mi_os_free(part, sizeof(mi_segmap_part_t), part->memid);
}
}
}

File diff suppressed because it is too large

View file

@ -20,10 +20,11 @@ terms of the MIT license. A copy of the license can be found in the file
// containing the whole library. If it is linked first // containing the whole library. If it is linked first
// it will override all the standard library allocation // it will override all the standard library allocation
// functions (on Unix's). // functions (on Unix's).
#include "alloc.c" // includes alloc-override.c #include "alloc.c" // includes alloc-override.c and free.c
#include "alloc-aligned.c" #include "alloc-aligned.c"
#include "alloc-posix.c" #include "alloc-posix.c"
#include "arena.c" #include "arena.c"
#include "arena-meta.c"
#include "bitmap.c" #include "bitmap.c"
#include "heap.c" #include "heap.c"
#include "init.c" #include "init.c"
@ -31,9 +32,8 @@ terms of the MIT license. A copy of the license can be found in the file
#include "options.c" #include "options.c"
#include "os.c" #include "os.c"
#include "page.c" // includes page-queue.c #include "page.c" // includes page-queue.c
#include "page-map.c"
#include "random.c" #include "random.c"
#include "segment.c"
#include "segment-map.c"
#include "stats.c" #include "stats.c"
#include "prim/prim.c" #include "prim/prim.c"
#if MI_OSX_ZONE #if MI_OSX_ZONE

View file

@ -19,85 +19,92 @@ terms of the MIT license. A copy of the license can be found in the file
Statistics operations Statistics operations
----------------------------------------------------------- */ ----------------------------------------------------------- */
static bool mi_is_in_main(void* stat) { static void mi_stat_update_mt(mi_stat_count_t* stat, int64_t amount) {
return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main if (amount == 0) return;
&& (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); // add atomically
int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
if (amount > 0) {
mi_atomic_addi64_relaxed(&stat->allocated, amount);
}
else {
mi_atomic_addi64_relaxed(&stat->freed, -amount);
}
} }
static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
if (amount == 0) return; if (amount == 0) return;
if mi_unlikely(mi_is_in_main(stat)) // add thread local
{ stat->current += amount;
// add atomically (for abandoned pages) if (stat->current > stat->peak) stat->peak = stat->current;
int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); if (amount > 0) {
mi_atomic_maxi64_relaxed(&stat->peak, current + amount); stat->allocated += amount;
if (amount > 0) {
mi_atomic_addi64_relaxed(&stat->allocated,amount);
}
else {
mi_atomic_addi64_relaxed(&stat->freed, -amount);
}
} }
else { else {
// add thread local stat->freed += -amount;
stat->current += amount;
if (stat->current > stat->peak) stat->peak = stat->current;
if (amount > 0) {
stat->allocated += amount;
}
else {
stat->freed += -amount;
}
} }
} }
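Editor's note: the multi-threaded variant above maintains `peak` with an atomic maximum (`mi_atomic_maxi64_relaxed`). A self-contained C11 sketch of that CAS-loop pattern, with illustrative names:

  #include <stdatomic.h>
  #include <stdint.h>

  // raise *peak to at least `value`, tolerating concurrent updates
  static void atomic_max_i64(_Atomic(int64_t)* peak, int64_t value) {
    int64_t cur = atomic_load_explicit(peak, memory_order_relaxed);
    while (cur < value &&
           !atomic_compare_exchange_weak_explicit(peak, &cur, value,
                                                  memory_order_relaxed, memory_order_relaxed)) {
      // on failure `cur` holds the latest value; retry until it is >= value or we install `value`
    }
  }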
// Adjust stats to compensate; for example before committing a range, // Adjust stats to compensate; for example before committing a range,
// first adjust downwards with parts that were already committed so // first adjust downwards with parts that were already committed so
// we avoid double counting. // we avoid double counting.
static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) { static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount, bool on_alloc) {
if (amount == 0) return; if (amount == 0) return;
if mi_unlikely(mi_is_in_main(stat)) // adjust atomically
{ mi_atomic_addi64_relaxed(&stat->current, amount);
// adjust atomically mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount);
mi_atomic_addi64_relaxed(&stat->current, amount); }
mi_atomic_addi64_relaxed(&stat->allocated, amount);
mi_atomic_addi64_relaxed(&stat->freed, amount); static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount, bool on_alloc) {
} if (amount == 0) return;
else { stat->current += amount;
// don't affect the peak if (on_alloc) {
stat->current += amount;
// add to both
stat->allocated += amount; stat->allocated += amount;
stat->freed += amount;
}
}
void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
if (mi_is_in_main(stat)) {
mi_atomic_addi64_relaxed( &stat->count, 1 );
mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
} }
else { else {
stat->count++; stat->freed += amount;
stat->total += amount;
} }
} }
void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) { void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) {
mi_atomic_addi64_relaxed(&stat->count, 1);
mi_atomic_addi64_relaxed(&stat->total, (int64_t)amount);
}
void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
stat->count++;
stat->total += amount;
}
void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) {
mi_stat_update_mt(stat, (int64_t)amount);
}
void __mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
mi_stat_update(stat, (int64_t)amount); mi_stat_update(stat, (int64_t)amount);
} }
void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount) {
mi_stat_update_mt(stat, -((int64_t)amount));
}
void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
mi_stat_update(stat, -((int64_t)amount)); mi_stat_update(stat, -((int64_t)amount));
} }
void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount) { void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
mi_stat_adjust(stat, (int64_t)amount); mi_stat_adjust_mt(stat, (int64_t)amount, on_alloc);
}
void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
mi_stat_adjust(stat, (int64_t)amount, on_alloc);
} }
void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) { void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
mi_stat_adjust(stat, -((int64_t)amount)); mi_stat_adjust_mt(stat, -((int64_t)amount), on_alloc);
} }
void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
mi_stat_adjust(stat, -((int64_t)amount), on_alloc);
}
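Editor's note, to make the double-counting comment concrete: when committing 1 MiB of which 256 KiB was already counted, the caller first adjusts downwards and then records the full increase, for a net change of +768 KiB to `current`. A simplified sketch of that adjust-then-increase pairing (plain arithmetic, not the internal macros):

  #include <stdint.h>

  // bookkeeping when committing `commit_size` bytes of which
  // `already_committed` bytes were counted earlier
  static void commit_stat_example(int64_t* current, int64_t commit_size, int64_t already_committed) {
    *current -= already_committed;   // adjust downwards first (no peak update)
    *current += commit_size;         // then record the full commit
    // net effect: *current grows by (commit_size - already_committed)
  }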
// must be thread safe as it is called from stats_merge // must be thread safe as it is called from stats_merge
static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) { static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
@ -119,7 +126,6 @@ static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t
// must be thread safe as it is called from stats_merge // must be thread safe as it is called from stats_merge
static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
if (stats==src) return; if (stats==src) return;
mi_stat_add(&stats->segments, &src->segments,1);
mi_stat_add(&stats->pages, &src->pages,1); mi_stat_add(&stats->pages, &src->pages,1);
mi_stat_add(&stats->reserved, &src->reserved, 1); mi_stat_add(&stats->reserved, &src->reserved, 1);
mi_stat_add(&stats->committed, &src->committed, 1); mi_stat_add(&stats->committed, &src->committed, 1);
@ -128,11 +134,9 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
mi_stat_add(&stats->page_committed, &src->page_committed, 1); mi_stat_add(&stats->page_committed, &src->page_committed, 1);
mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1); mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1);
mi_stat_add(&stats->threads, &src->threads, 1); mi_stat_add(&stats->threads, &src->threads, 1);
mi_stat_add(&stats->malloc, &src->malloc, 1); mi_stat_add(&stats->malloc, &src->malloc, 1);
mi_stat_add(&stats->segments_cache, &src->segments_cache, 1);
mi_stat_add(&stats->normal, &src->normal, 1); mi_stat_add(&stats->normal, &src->normal, 1);
mi_stat_add(&stats->huge, &src->huge, 1); mi_stat_add(&stats->huge, &src->huge, 1);
mi_stat_add(&stats->giant, &src->giant, 1); mi_stat_add(&stats->giant, &src->giant, 1);
@ -146,7 +150,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
mi_stat_counter_add(&stats->searches, &src->searches, 1); mi_stat_counter_add(&stats->searches, &src->searches, 1);
mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1);
mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1); mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1);
#if MI_STAT>1 #if MI_STAT>1
for (size_t i = 0; i <= MI_BIN_HUGE; i++) { for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
@ -165,7 +169,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
// unit == 0: count as decimal // unit == 0: count as decimal
// unit < 0 : count in binary // unit < 0 : count in binary
static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) { static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) {
char buf[32]; buf[0] = 0; char buf[32]; _mi_memzero_var(buf);
int len = 32; int len = 32;
const char* suffix = (unit <= 0 ? " " : "B"); const char* suffix = (unit <= 0 ? " " : "B");
const int64_t base = (unit == 0 ? 1000 : 1024); const int64_t base = (unit == 0 ? 1000 : 1024);
@ -330,7 +334,7 @@ static void mi_cdecl mi_buffered_out(const char* msg, void* arg) {
static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept { static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
// wrap the output function to be line buffered // wrap the output function to be line buffered
char buf[256]; char buf[256]; _mi_memzero_var(buf);
buffered_t buffer = { out0, arg0, NULL, 0, 255 }; buffered_t buffer = { out0, arg0, NULL, 0, 255 };
buffer.buf = buf; buffer.buf = buf;
mi_output_fun* out = &mi_buffered_out; mi_output_fun* out = &mi_buffered_out;
@ -343,7 +347,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
#endif #endif
#if MI_STAT #if MI_STAT
mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg); mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg);
mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
mi_stat_count_t total = { 0,0,0,0 }; mi_stat_count_t total = { 0,0,0,0 };
mi_stat_add(&total, &stats->normal, 1); mi_stat_add(&total, &stats->normal, 1);
mi_stat_add(&total, &stats->huge, 1); mi_stat_add(&total, &stats->huge, 1);
@ -357,21 +361,24 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, ""); mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, "");
mi_stat_peak_print(&stats->reset, "reset", 1, out, arg ); mi_stat_peak_print(&stats->reset, "reset", 1, out, arg );
mi_stat_peak_print(&stats->purged, "purged", 1, out, arg ); mi_stat_peak_print(&stats->purged, "purged", 1, out, arg );
mi_stat_print(&stats->page_committed, "touched", 1, out, arg); //mi_stat_print(&stats->segments, "segments", -1, out, arg);
mi_stat_print(&stats->segments, "segments", -1, out, arg); //mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); //mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, "");
mi_stat_print(&stats->pages, "pages", -1, out, arg); mi_stat_print_ex(&stats->pages, "pages", -1, out, arg, "");
mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg);
mi_stat_counter_print(&stats->pages_reclaim_on_alloc, "-reclaima", out, arg);
mi_stat_counter_print(&stats->pages_reclaim_on_free, "-reclaimf", out, arg);
mi_stat_counter_print(&stats->pages_reabandon_full, "-reabandon", out, arg);
mi_stat_counter_print(&stats->pages_unabandon_busy_wait, "-waits", out, arg);
mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg);
mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg);
mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); mi_stat_counter_print(&stats->arena_count, "arenas", out, arg);
mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg); mi_stat_counter_print(&stats->arena_purges, "-purges", out, arg);
mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg); mi_stat_counter_print(&stats->mmap_calls, "mmap calls", out, arg);
mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg); mi_stat_counter_print(&stats->commit_calls, " -commit", out, arg);
mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); mi_stat_counter_print(&stats->reset_calls, "-reset", out, arg);
mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); mi_stat_counter_print(&stats->purge_calls, "-purge", out, arg);
mi_stat_counter_print(&stats->purge_calls, "purges", out, arg);
mi_stat_counter_print(&stats->guarded_alloc_count, "guarded", out, arg); mi_stat_counter_print(&stats->guarded_alloc_count, "guarded", out, arg);
mi_stat_print(&stats->threads, "threads", -1, out, arg); mi_stat_print(&stats->threads, "threads", -1, out, arg);
mi_stat_counter_print_avg(&stats->searches, "searches", out, arg); mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
@@ -399,36 +406,37 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
 static mi_msecs_t mi_process_start; // = 0
-static mi_stats_t* mi_stats_get_default(void) {
-  mi_heap_t* heap = mi_heap_get_default();
-  return &heap->tld->stats;
-}
-static void mi_stats_merge_from(mi_stats_t* stats) {
-  if (stats != &_mi_stats_main) {
-    mi_stats_add(&_mi_stats_main, stats);
-    memset(stats, 0, sizeof(mi_stats_t));
-  }
+// return thread local stats
+static mi_stats_t* mi_get_tld_stats(void) {
+  return &mi_heap_get_default()->tld->stats;
 }
 void mi_stats_reset(void) mi_attr_noexcept {
-  mi_stats_t* stats = mi_stats_get_default();
-  if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
-  memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
+  mi_stats_t* stats = mi_get_tld_stats();
+  mi_subproc_t* subproc = _mi_subproc();
+  if (stats != &subproc->stats) { _mi_memzero(stats, sizeof(mi_stats_t)); }
+  _mi_memzero(&subproc->stats, sizeof(mi_stats_t));
   if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
 }
-void mi_stats_merge(void) mi_attr_noexcept {
-  mi_stats_merge_from( mi_stats_get_default() );
+void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from) {
+  if (to != from) {
+    mi_stats_add(to, from);
+    _mi_memzero(from, sizeof(mi_stats_t));
+  }
 }
 void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done`
-  mi_stats_merge_from(stats);
+  _mi_stats_merge_from(&_mi_subproc()->stats, stats);
+}
+void mi_stats_merge(void) mi_attr_noexcept {
+  _mi_stats_done( mi_get_tld_stats() );
 }
 void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
-  mi_stats_merge_from(mi_stats_get_default());
-  _mi_stats_print(&_mi_stats_main, out, arg);
+  mi_stats_merge();
+  _mi_stats_print(&_mi_subproc()->stats, out, arg);
 }
 void mi_stats_print(void* out) mi_attr_noexcept {
@@ -437,7 +445,7 @@ void mi_stats_print(void* out) mi_attr_noexcept {
 }
 void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
-  _mi_stats_print(mi_stats_get_default(), out, arg);
+  _mi_stats_print(mi_get_tld_stats(), out, arg);
 }
@@ -471,11 +479,12 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) {
 mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
 {
+  mi_subproc_t* subproc = _mi_subproc();
   mi_process_info_t pinfo;
   _mi_memzero_var(pinfo);
   pinfo.elapsed = _mi_clock_end(mi_process_start);
-  pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
-  pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
+  pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.current)));
+  pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.peak)));
   pinfo.current_rss = pinfo.current_commit;
   pinfo.peak_rss = pinfo.peak_commit;
   pinfo.utime = 0;
@@ -483,7 +492,7 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s
   pinfo.page_faults = 0;
   _mi_prim_process_info(&pinfo);
   if (elapsed_msecs!=NULL) *elapsed_msecs = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX));
   if (user_msecs!=NULL)    *user_msecs    = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX));
   if (system_msecs!=NULL)  *system_msecs  = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.stime : PTRDIFF_MAX));
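
Note: the statistics refactor above drops the global `_mi_stats_main` in favor of per-subprocess statistics: every thread keeps its own `mi_stats_t` in its thread-local data (`mi_get_tld_stats`), and `_mi_stats_merge_from` folds those counters into `_mi_subproc()->stats` when a thread finishes (`_mi_stats_done`) or when `mi_stats_merge` is called. Below is a minimal standalone sketch of that merge-and-reset pattern; the struct fields and helper names are illustrative placeholders, not mimalloc's actual internals.

#include <stdio.h>
#include <string.h>

/* stand-in for mi_stats_t; the real struct has many more counters */
typedef struct stats_s { long committed; long malloc_count; } stats_t;

/* merge `from` into `to` and zero `from` -- the shape of _mi_stats_merge_from */
static void stats_merge_from(stats_t* to, stats_t* from) {
  if (to != from) {
    to->committed    += from->committed;
    to->malloc_count += from->malloc_count;
    memset(from, 0, sizeof(stats_t));     /* thread-local stats restart at zero */
  }
}

int main(void) {
  stats_t subproc = { 0, 0 };             /* plays the role of _mi_subproc()->stats */
  stats_t thread  = { 4096, 42 };         /* plays the role of a thread's tld->stats */
  stats_merge_from(&subproc, &thread);    /* what _mi_stats_done does at thread exit */
  printf("committed=%ld mallocs=%ld\n", subproc.committed, subproc.malloc_count);
  return 0;
}
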

View file

@@ -50,7 +50,6 @@ int main() {
   // mi_bins();
   void* p1 = malloc(78);
   void* p2 = malloc(24);
   free(p1);
@@ -83,7 +82,7 @@ int main() {
 static void invalid_free() {
   free((void*)0xBADBEEF);
-  realloc((void*)0xBADBEEF,10);
+  realloc((void*)0xBADBEEF, 10);
 }
 static void block_overflow1() {
@@ -181,7 +180,7 @@ static void test_process_info(void) {
   size_t peak_commit = 0;
   size_t page_faults = 0;
   for (int i = 0; i < 100000; i++) {
-    void* p = calloc(100,10);
+    void* p = calloc(100, 10);
     free(p);
   }
   mi_process_info(&elapsed, &user_msecs, &system_msecs, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
@@ -239,8 +238,8 @@ static void test_heap_walk(void) {
 }
 static void test_canary_leak(void) {
-  char* p = mi_mallocn_tp(char,23);
-  for(int i = 0; i < 23; i++) {
+  char* p = mi_mallocn_tp(char, 22);
+  for (int i = 0; i < 22; i++) {
     p[i] = '0'+i;
   }
   puts(p);
@@ -286,15 +285,15 @@ static void test_manage_os_memory(void) {
 static void test_large_pages(void) {
   mi_memid_t memid;
 #if 0
   size_t pages_reserved;
   size_t page_size;
   uint8_t* p = (uint8_t*)_mi_os_alloc_huge_os_pages(1, -1, 30000, &pages_reserved, &page_size, &memid);
   const size_t req_size = pages_reserved * page_size;
 #else
   const size_t req_size = 64*MI_MiB;
-  uint8_t* p = (uint8_t*)_mi_os_alloc(req_size,&memid,NULL);
+  uint8_t* p = (uint8_t*)_mi_os_alloc(req_size, &memid, NULL);
 #endif
   p[0] = 1;
@@ -317,8 +316,8 @@ static void test_large_pages(void) {
 #if 0
 #include <stdint.h>
 #include <stdbool.h>
+#include <mimalloc/bits.h>
-#define MI_INTPTR_SIZE 8
 #define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE)
 #define MI_BIN_HUGE 100
@@ -370,8 +369,6 @@ uint8_t _mi_bsr(uintptr_t x) {
 #endif
 }
 static inline size_t _mi_wsize_from_size(size_t size) {
   return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
 }
@@ -408,7 +405,9 @@ extern inline uint8_t _mi_bin8(size_t size) {
 #endif
     wsize--;
     // find the highest bit
-    uint8_t b = mi_bsr32((uint32_t)wsize);
+    size_t idx;
+    mi_bsr(wsize, &idx);
+    uint8_t b = (uint8_t)idx;
     // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
     // - adjust with 3 because we use do not round the first 8 sizes
     //   which each get an exact bin
@@ -440,7 +439,9 @@ static inline uint8_t _mi_bin4(size_t size) {
     bin = MI_BIN_HUGE;
   }
   else {
-    uint8_t b = mi_bsr32((uint32_t)wsize);
+    size_t idx;
+    mi_bsr(wsize, &idx);
+    uint8_t b = (uint8_t)idx;
     bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3;
   }
   return bin;
@@ -456,7 +457,9 @@ static size_t _mi_binx4(size_t wsize) {
     bin = (uint8_t)wsize;
   }
   else {
-    uint8_t b = mi_bsr32((uint32_t)wsize);
+    size_t idx;
+    mi_bsr(wsize, &idx);
+    uint8_t b = (uint8_t)idx;
     if (b <= 1) return wsize;
     bin = ((b << 1) | (wsize >> (b - 1))&0x01) + 3;
   }
@@ -465,7 +468,9 @@ static size_t _mi_binx4(size_t wsize) {
 static size_t _mi_binx8(size_t bsize) {
   if (bsize<=1) return bsize;
-  uint8_t b = mi_bsr32((uint32_t)bsize);
+  size_t idx;
+  mi_bsr(bsize, &idx);
+  uint8_t b = (uint8_t)idx;
   if (b <= 2) return bsize;
   size_t bin = ((b << 2) | (bsize >> (b - 2))&0x03) - 5;
   return bin;
@@ -483,8 +488,10 @@ static inline size_t mi_bin(size_t wsize) {
   }
   else {
     wsize--;
+    assert(wsize>0);
     // find the highest bit
-    uint8_t b = (uint8_t)mi_bsr32((uint32_t)wsize); // note: wsize != 0
+    uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize));
     // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
     // - adjust with 3 because we use do not round the first 8 sizes
     //   which each get an exact bin
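
Note: the hunks above replace the 32-bit `mi_bsr32` call sites with the word-sized `mi_bsr`/`mi_clz` helpers (a bit-scan-reverse of a non-zero value is the same as the bit width minus one minus the count of leading zeros), while the binning idea itself is unchanged: take the position of the highest set bit plus the next couple of bits below it, so each power-of-two range splits into a few bins and worst-case internal fragmentation stays bounded. A self-contained sketch of this pseudo-logarithmic binning follows; the exact offsets differ between `_mi_bin4`, `_mi_bin8`, and `_mi_binx8`, so the constants here are illustrative only.

#include <stdio.h>
#include <stddef.h>

/* portable highest-set-bit; equivalent to (bit width - 1) - clz(x); assumes x != 0 */
static unsigned bsr(size_t x) {
  unsigned b = 0;
  while (x >>= 1) b++;
  return b;
}

/* pseudo-logarithmic size class in the spirit of the code above: the first 8
   word sizes get exact bins, after that each power-of-two range is split into
   4 bins by the two bits below the leading bit (illustrative -3 offset) */
static size_t size_class(size_t wsize) {
  if (wsize <= 8) return wsize;
  wsize--;
  unsigned b = bsr(wsize);                             /* highest set bit */
  return ((b << 2) + ((wsize >> (b - 2)) & 0x03)) - 3;
}

int main(void) {
  /* e.g. word sizes 9..10 share a bin, 11..12 the next, and so on */
  for (size_t w = 8; w <= 20; w++) {
    printf("wsize %2zu -> bin %zu\n", w, size_class(w));
  }
  return 0;
}
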

View file

@@ -388,7 +388,7 @@ static void test_mt_shutdown()
 // issue #372
 static void fail_aslr() {
-  size_t sz = (4ULL << 40); // 4TiB
+  uint64_t sz = (4ULL << 40); // 4TiB
   void* p = malloc(sz);
   printf("pointer p: %p: area up to %p\n", p, (uint8_t*)p + sz);
   *(int*)0x5FFFFFFF000 = 0; // should segfault

View file

@@ -34,7 +34,7 @@ we therefore test the API over various inputs. Please add more tests :-)
 #include "mimalloc.h"
 // #include "mimalloc/internal.h"
-#include "mimalloc/types.h" // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX
+#include "mimalloc/types.h" // for MI_DEBUG and MI_PAGE_MAX_OVERALLOC_ALIGN
 #include "testhelper.h"
@@ -169,7 +169,7 @@ int main(void) {
 /*
   CHECK_BODY("malloc-aligned6") {
     bool ok = true;
-    for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) {
+    for (size_t align = 1; align <= MI_PAGE_MAX_OVERALLOC_ALIGN && ok; align *= 2) {
       void* ps[8];
       for (int i = 0; i < 8 && ok; i++) {
         ps[i] = mi_malloc_aligned(align*13 // size
@@ -186,16 +186,16 @@ int main(void) {
   };
 */
   CHECK_BODY("malloc-aligned7") {
-    void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX);
+    void* p = mi_malloc_aligned(1024,MI_PAGE_MAX_OVERALLOC_ALIGN);
     mi_free(p);
-    result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0;
+    result = ((uintptr_t)p % MI_PAGE_MAX_OVERALLOC_ALIGN) == 0;
   };
   CHECK_BODY("malloc-aligned8") {
     bool ok = true;
     for (int i = 0; i < 5 && ok; i++) {
       int n = (1 << i);
-      void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX);
-      ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0;
+      void* p = mi_malloc_aligned(1024, n * MI_PAGE_MAX_OVERALLOC_ALIGN);
+      ok = ((uintptr_t)p % (n*MI_PAGE_MAX_OVERALLOC_ALIGN)) == 0;
       mi_free(p);
     }
     result = ok;
@@ -203,7 +203,7 @@ int main(void) {
   CHECK_BODY("malloc-aligned9") { // test large alignments
     bool ok = true;
     void* p[8];
-    size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 };
+    size_t sizes[8] = { 8, 512, 1024 * 1024, MI_PAGE_MAX_OVERALLOC_ALIGN, MI_PAGE_MAX_OVERALLOC_ALIGN + 1, 2 * MI_PAGE_MAX_OVERALLOC_ALIGN, 8 * MI_PAGE_MAX_OVERALLOC_ALIGN, 0 };
     for (int i = 0; i < 28 && ok; i++) {
       int align = (1 << i);
       for (int j = 0; j < 8 && ok; j++) {

View file

@@ -40,6 +40,19 @@ static int ITER = 20;
 static int THREADS = 8;
 static int SCALE = 10;
 static int ITER = 10;
+#elif 0
+static int THREADS = 4;
+static int SCALE = 10;
+static int ITER = 20;
+#elif 0
+static int THREADS = 32;
+static int SCALE = 50;
+static int ITER = 50;
+#elif 0
+static int THREADS = 32;
+static int SCALE = 25;
+static int ITER = 50;
+#define ALLOW_LARGE true
 #else
 static int THREADS = 32; // more repeatable if THREADS <= #processors
 static int SCALE = 50; // scaling factor
@@ -50,7 +63,12 @@ static int ITER = 50; // N full iterations destructing and re-creating a
 #define STRESS // undefine for leak test
-static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100)
+#ifndef ALLOW_LARGE
+#define ALLOW_LARGE false
+#endif
+static bool allow_large_objects = ALLOW_LARGE; // allow very large objects? (set to `true` if SCALE>100)
 static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`?
 static bool main_participates = false; // main thread participates as a worker too
@@ -66,7 +84,7 @@ static bool main_participates = false; // main thread participates as a
 #define custom_free(p) mi_free(p)
 #ifndef NDEBUG
-#define HEAP_WALK // walk the heap objects?
+#define xHEAP_WALK // walk the heap objects?
 #endif
 #endif
@@ -241,9 +259,21 @@ static void test_stress(void) {
     //mi_debug_show_arenas(true);
 #endif
 #if !defined(NDEBUG) || defined(MI_TSAN)
-    if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); }
+    if ((n + 1) % 10 == 0) {
+      printf("- iterations left: %3d\n", ITER - (n + 1));
+      mi_debug_show_arenas(true);
+      //mi_collect(true);
+      //mi_debug_show_arenas(true);
+    }
 #endif
   }
+  // clean up
+  for (int i = 0; i < TRANSFERS; i++) {
+    void* p = atomic_exchange_ptr(&transfer[i], NULL);
+    if (p != NULL) {
+      free_items(p);
+    }
+  }
 }
 #ifndef STRESS
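
Note: the added clean-up loop in `test_stress` drains whatever blocks are still parked in the shared `transfer` slots before the test returns, so the test frees them itself instead of having them reported as leaks at exit. The sketch below shows the same drain-with-atomic-exchange idea in plain C11; the slot array and helper names are placeholders, not the test's actual `transfer`/`free_items` definitions.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define SLOTS 8
static _Atomic(void*) slot[SLOTS];          /* shared hand-off slots, as in the stress test */

/* drain every slot exactly once: atomic_exchange transfers ownership to this
   thread even if workers were still swapping pointers in and out */
static void drain_slots(void) {
  for (int i = 0; i < SLOTS; i++) {
    void* p = atomic_exchange(&slot[i], NULL);
    if (p != NULL) free(p);                 /* the stress test calls free_items(p) here */
  }
}

int main(void) {
  atomic_store(&slot[3], malloc(64));       /* park one block, as a worker thread would */
  drain_slots();                            /* nothing is left behind for a leak checker */
  printf("drained\n");
  return 0;
}
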
@@ -274,6 +304,10 @@ int main(int argc, char** argv) {
 #endif
 #if !defined(NDEBUG) && !defined(USE_STD_MALLOC)
   mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */);
+  //mi_option_set(mi_option_purge_delay,10);
+#endif
+#if defined(NDEBUG) && !defined(USE_STD_MALLOC)
+  // mi_option_set(mi_option_purge_delay,-1);
 #endif
 #ifndef USE_STD_MALLOC
   mi_stats_reset();
@@ -318,7 +352,7 @@ int main(int argc, char** argv) {
 #ifndef NDEBUG
   mi_debug_show_arenas(true);
   mi_collect(true);
 #endif
 #endif
   mi_stats_print(NULL);
   //bench_end_program();
@@ -341,9 +375,10 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) {
   thread_entry_fun = fun;
   DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD));
   HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE));
+  thandles[0] = GetCurrentThread(); // avoid lint warning
   const size_t start = (main_participates ? 1 : 0);
   for (size_t i = start; i < nthreads; i++) {
-    thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]);
+    thandles[i] = CreateThread(0, 8*1024L, &thread_entry, (void*)(i), 0, &tids[i]);
   }
   if (main_participates) fun(0); // run the main thread as well
   for (size_t i = start; i < nthreads; i++) {