Compare commits

...

269 commits

Author SHA1 Message Date
Daan Leijen
e394e340e4 Merge branch 'dev' into dev3 2025-01-03 18:11:11 -08:00
Daan Leijen
e14c8fc795 bump version to 3.0.0 2025-01-03 18:08:34 -08:00
Daan Leijen
07bf4eea26 merge from dev 2025-01-03 18:07:01 -08:00
Daan Leijen
c95d9865a8 merge from dev3-bin 2025-01-03 14:27:18 -08:00
Daan Leijen
03d816d7be Merge branch 'dev3' into dev3-bin 2025-01-03 14:26:44 -08:00
Daan Leijen
6099f76c8c nicer logic in free 2025-01-03 14:26:32 -08:00
daanx
b432f77bfc Merge branch 'dev3' into dev3-bin 2025-01-03 13:50:37 -08:00
daanx
f6c2550eac fix enable large pages 2025-01-03 13:50:31 -08:00
Daan Leijen
b6adbbca0c combine flags and xthread_id 2025-01-03 13:15:46 -08:00
Daan Leijen
3c43225c1f fix initialization warning on gcc 2025-01-03 08:51:02 -08:00
Daan Leijen
281a513642 fix initialization warning on gcc 2025-01-03 08:48:06 -08:00
Daan Leijen
bbd7a492f0 fix signedness warning 2025-01-03 08:46:30 -08:00
Daan Leijen
7e539cc353 Merge branch 'dev3' into dev3-bin 2025-01-03 08:38:45 -08:00
Daan Leijen
2a75500ac2 disable large pages by default 2025-01-03 08:38:36 -08:00
Daan Leijen
4e43ebb496 Merge branch 'dev' into dev3 2025-01-03 08:17:44 -08:00
Daan Leijen
53873df613 Merge branch 'dev3' into dev3-bin 2025-01-02 17:25:49 -08:00
Daan Leijen
211f11218e merge from dev 2025-01-02 17:25:38 -08:00
Daan Leijen
9363900f75 Merge branch 'dev3-bin' of e:\dev\mimalloc3 into dev3-bin 2025-01-02 15:21:43 -08:00
daanx
ab78d57a84 search size bins from small to large 2025-01-02 15:19:08 -08:00
daanx
d25f714ff5 merge from dev3 2025-01-02 15:06:31 -08:00
daanx
d242e86e74 Merge branch 'dev' into dev3 2025-01-02 15:02:57 -08:00
daanx
34e402e128 fix NX test in try_find_and_clearN 2025-01-02 15:00:17 -08:00
daanx
10b40f90fc fix scan of NX 2025-01-02 14:59:42 -08:00
Daan Leijen
44264b3d8b Merge branch 'dev3-bin' of e:\dev\mimalloc3 into dev3-bin 2025-01-02 12:45:38 -08:00
daanx
670ebd0348 merge from dev3; make medium bin larger than other 2025-01-02 12:24:27 -08:00
daanx
5e26ba6fe6 fix debug output 2025-01-02 12:14:12 -08:00
daanx
3933ac9a3f merge from dev3 2025-01-02 11:54:26 -08:00
daanx
c507ee3d96 make bitmap scan cross bfields for NX; disable the use of large object pages 2025-01-02 11:42:28 -08:00
daanx
ff52ea0553 Merge branch 'dev3' into dev3-bin 2024-12-31 15:11:24 -08:00
daanx
0d302cd174 add comments 2024-12-31 15:11:09 -08:00
Daan Leijen
84f2038a2c Merge branch 'dev3' into dev3-bin 2024-12-31 14:28:25 -08:00
Daan Leijen
9665d604d3 merge from dev 2024-12-31 14:28:09 -08:00
Daan
9511d09529 add neon version for chunk all_set 2024-12-26 23:51:37 -08:00
Daan
dddcd5de16 add neon version for chunk_is_clear 2024-12-26 23:49:38 -08:00
Daan
82a8b2445e Merge branch 'dev3' into dev3-bin 2024-12-26 23:12:11 -08:00
Daan
8a4c26377f add neon code for bit clear 2024-12-26 23:12:03 -08:00
Daan
c9ab24899c Merge branch 'dev3' into dev3-bin 2024-12-26 11:19:32 -08:00
Daan
e6d9011b9d Merge branch 'dev' into dev3 2024-12-26 11:19:04 -08:00
daanx
e359e9b12b merge from dev3 2024-12-26 10:43:10 -08:00
daanx
fb704834c4 Merge branch 'dev3' into dev3-bin 2024-12-26 10:42:35 -08:00
daanx
0a7fd7eb6f use fixed tls on windows with static linking 2024-12-26 10:42:24 -08:00
daanx
807b5cd342 Merge branch 'dev3' into dev3-bin 2024-12-26 10:38:02 -08:00
daanx
8b6eb4752b merge from dev, add decl_hidden for better codegen on page_map loading 2024-12-26 10:37:51 -08:00
daanx
f72ac7a5aa add attr_noexept for better codegen on msvc 2024-12-26 10:28:36 -08:00
daanx
4c5bc125ab Merge branch 'dev3' into dev3-bin 2024-12-26 10:25:03 -08:00
daanx
b70fd1093a merge from dev 2024-12-26 10:24:56 -08:00
daanx
2aad74e0c3 Merge branch 'dev3' into dev3-bin 2024-12-26 10:15:38 -08:00
daanx
bec06cfb95 merge from dev 2024-12-26 10:15:08 -08:00
daanx
27e0c467ae fix c++ initializer warning 2024-12-25 14:56:11 -08:00
Daan Leijen
76d50d4566 Merge branch 'dev3' into dev3-bin 2024-12-25 14:41:43 -08:00
Daan Leijen
efe10513ec fix initializer warning on clang-18 2024-12-25 14:40:32 -08:00
daanx
a245135d89 Merge branch 'dev3' into dev3-bin 2024-12-25 14:12:52 -08:00
daanx
5f13941c18 fix constructor re-initialization on subproc_main 2024-12-25 14:12:45 -08:00
daanx
c65d5b878b Merge branch 'dev3' into dev3-bin 2024-12-25 13:30:50 -08:00
daanx
7ae726bb39 small fixes 2024-12-25 13:30:42 -08:00
daanx
b5c4a3c6e7 merge from dev3 2024-12-25 11:47:54 -08:00
daanx
8339cefdeb fix stats for delay purge commit 2024-12-25 11:45:01 -08:00
daanx
15061be4b2 commit page-map within one allocation 2024-12-25 10:50:49 -08:00
daanx
ce7eb4db7a fix page commit-on-demand setting 2024-12-25 10:49:49 -08:00
daanx
5a663da9aa fix build warning 2024-12-24 20:38:36 -08:00
daanx
e64d6fcc47 Merge branch 'dev3' into dev3-bin 2024-12-24 20:23:47 -08:00
daanx
24b8384f80 remove is_expandable requirement on page candidates 2024-12-24 20:23:37 -08:00
daanx
a65742fdf9 merge from dev3 2024-12-24 20:21:56 -08:00
daanx
8259c0eb7c nice colors for heap maps 2024-12-24 20:10:44 -08:00
daanx
50d22cf092 Merge branch 'dev3' into dev3-bin 2024-12-24 17:15:01 -08:00
daanx
4d1d3471cf rename page options 2024-12-24 17:14:53 -08:00
daanx
fe8e52cbcc Merge branch 'dev3' into dev3-bin 2024-12-24 17:07:19 -08:00
daanx
1e1a12bf3c fix rounding issue with huge size allocations 2024-12-24 17:07:11 -08:00
daanx
6f6190c8a9 Merge branch 'dev3' into dev3-bin 2024-12-24 16:40:02 -08:00
daanx
d862e57955 fix huge page allocation size 2024-12-24 16:39:54 -08:00
daanx
e078879825 Merge branch 'dev3' into dev3-bin 2024-12-24 15:00:14 -08:00
daanx
ad6f48f3e4 fix assertion for huge pages 2024-12-24 15:00:05 -08:00
Daan Leijen
431370df62 Merge branch 'dev3' into dev3-bin 2024-12-24 12:10:46 -08:00
Daan Leijen
016b36d917 fix max va bits on unix 2024-12-24 12:10:34 -08:00
Daan Leijen
71a1645d4d fix build 2024-12-24 12:04:21 -08:00
daanx
7c331a967b merge from dev3 2024-12-24 11:42:02 -08:00
daanx
d21114b5f2 improve page commit on demand 2024-12-24 11:37:52 -08:00
daanx
ba68810333 commit page on demand 2024-12-23 18:33:37 -08:00
daanx
9a7c0d443a max obj size 1/8 of a page 2024-12-23 17:15:13 -08:00
daanx
b77b34df96 double arena per 4; large page objects 1/8 of large page size 2024-12-23 17:10:34 -08:00
daanx
3fa3476712 Merge branch 'dev3' into dev3-bin 2024-12-23 16:47:08 -08:00
daanx
9bad269c51 fix purge delay check for arenas 2024-12-23 16:47:01 -08:00
daanx
c65c6d83bd fix guard page size 2024-12-23 16:31:42 -08:00
daanx
b515a0ad4c add _mi_os_guard_page_size 2024-12-23 16:28:34 -08:00
daanx
88d8ee964f remove is_large member (and use is_pinned for this) 2024-12-23 15:04:06 -08:00
daanx
657135de36 commit 2level page-map on over-commit systems 2024-12-23 09:53:52 -08:00
daanx
da2ab86e9f Merge branch 'dev3' into dev3-bin 2024-12-22 22:31:26 -08:00
daanx
bc5ae31649 add abandoned_visit_blocks 2024-12-22 22:31:16 -08:00
daanx
04970f43e5 document way to use a TLS slot on windows 2024-12-22 21:55:40 -08:00
daanx
dd1b37c9f8 fix recursive tls access on macOS <= 14 2024-12-22 21:03:03 -08:00
daanx
8d2b7b0383 merge from dev3 2024-12-22 18:34:39 -08:00
daanx
36bf7dfc45 Merge branch 'dev3' into dev3-bin 2024-12-22 18:33:56 -08:00
daanx
f605cb73e5 old purge delay 2024-12-22 18:33:44 -08:00
daanx
823f5b7ecd merge from dev3 2024-12-22 18:32:47 -08:00
daanx
e61ab67185 cleanup 2024-12-22 18:31:33 -08:00
daanx
1eea4309b6 Merge branch 'dev3' into dev3-bin 2024-12-22 18:09:27 -08:00
daanx
db82baf1a8 cleanup, some renaming 2024-12-22 18:09:16 -08:00
daanx
9ecadaecd5 clean up 2024-12-22 17:55:56 -08:00
daanx
b920fc1b72 merge from dev3 2024-12-22 17:38:48 -08:00
daanx
773fe7ae5b support full secure build 2024-12-22 17:25:58 -08:00
daanx
516e644359 rename option pagemap_commit; always commit the page map on macos (for now) 2024-12-22 16:06:49 -08:00
daanx
6b97830f6a merge from dev3 2024-12-22 14:40:46 -08:00
daanx
c5cfc92f0c small fixes 2024-12-22 14:39:57 -08:00
daanx
a42a2a926b improving level 2 page-map 2024-12-22 14:18:33 -08:00
daanx
3c7d7e1f11 experiment with 2 level pagemap 2024-12-22 14:07:57 -08:00
daanx
8d16303aa6 add -mtune=native with opt arch 2024-12-22 12:21:31 -08:00
daanx
93fa8d895a revert back to flat address map 2024-12-22 12:18:53 -08:00
daanx
c9b2d31665 fix page_map initialization 2024-12-21 23:17:11 -08:00
daanx
56cbddfc7e initial work on a two-level page-map 2024-12-21 23:08:52 -08:00
daanx
1e2221f512 fix signed/unsigned; fix heap_destroy assert failure 2024-12-21 19:28:53 -08:00
daanx
bfc498e54a Merge branch 'dev3' into dev3-bin 2024-12-21 16:25:04 -08:00
daanx
d7d626cbfa enable collecting from the full page queue 2024-12-21 16:24:56 -08:00
daanx
b991510813 merge from dev3 2024-12-21 15:56:22 -08:00
daanx
da17a59bdb re-add deferred free and heap retired collect 2024-12-21 15:53:50 -08:00
daanx
5de5550c63 merge from dev3 2024-12-21 15:52:15 -08:00
daanx
c138fba149 merge from dev 2024-12-21 15:49:17 -08:00
daanx
1a6fbdf0b2 merge from dev 2024-12-21 15:48:49 -08:00
daanx
108c84e858 remove req_arena parameter to arena_reserve 2024-12-21 14:45:14 -08:00
daanx
7d46478a5f add initial load/unload for heaps 2024-12-21 13:19:06 -08:00
daanx
89b0d5a357 allocate heaps associated with an arena in that arena 2024-12-21 11:53:29 -08:00
daanx
4ad7fedd25 track os abandoned pages in a list 2024-12-21 11:35:30 -08:00
daanx
95aeda4cdd merge subproc stats on delete 2024-12-21 10:53:34 -08:00
daanx
dece8a587b make stats part of a subproc 2024-12-21 10:43:08 -08:00
daanx
daac75af36 fix lock recursion 2024-12-20 22:13:58 -08:00
daanx
a5b7d7f264 subprocesses own arena's 2024-12-20 21:38:31 -08:00
daanx
53857ddaa3 Merge branch 'dev' into dev3 2024-12-20 17:32:32 -08:00
daanx
7141d9f164 remove busy wait for arena reservation 2024-12-20 17:31:48 -08:00
daanx
bc459b5e16 Merge branch 'dev3' of https://github.com/microsoft/mimalloc into dev3 2024-12-20 16:46:18 -08:00
Daan Leijen
278f1ff556 merge from dev; match test-stress 2024-12-20 14:00:02 -08:00
daanx
b2d1b4c472 Merge branch 'dev3-bin' of https://github.com/microsoft/mimalloc into dev3-bin 2024-12-20 13:10:55 -08:00
daanx
efa82e1c7d Merge branch 'dev3' of https://github.com/microsoft/mimalloc into dev3 2024-12-20 13:10:16 -08:00
Daan Leijen
f0f4c9c009 Merge branch 'dev3' into dev3-bin 2024-12-20 13:07:00 -08:00
Daan Leijen
7822438561 merge from dev 2024-12-20 13:06:46 -08:00
Daan Leijen
4322546a9b Merge branch 'dev3' into dev3-bin 2024-12-20 13:01:09 -08:00
Daan Leijen
f6408235f7 merge from dev 2024-12-20 13:01:00 -08:00
Daan Leijen
13a58ac343 Merge branch 'dev3' into dev3-bin 2024-12-20 11:56:16 -08:00
Daan Leijen
5614c5052e don't prefer high used candidate if it is too full 2024-12-20 11:56:04 -08:00
Daan Leijen
2db407d1e9 revert back to generating mimalloc.dll instead of mimalloc-override.dll 2024-12-20 11:54:39 -08:00
daanx
3746bf79ed small fixes; max object size 1/8th of a pages 2024-12-19 21:30:03 -08:00
daanx
9a4c264e76 Merge branch 'dev3' into dev3-bin 2024-12-19 19:18:10 -08:00
daanx
de8001c107 add specialized is_set for 1 bit 2024-12-19 19:18:04 -08:00
daanx
8dd605099b fix arm64ec asm 2024-12-19 15:29:40 -08:00
daanx
02b59e0f15 Merge branch 'dev3' into dev3-bin 2024-12-19 11:01:12 -08:00
daanx
b18e1546a7 merge from dev 2024-12-18 15:59:33 -08:00
daanx
2d679959b7 Merge branch 'dev3' into dev3-bin 2024-12-17 19:13:14 -08:00
daanx
264d5a6704 update stat adjustment for purging 2024-12-17 19:13:03 -08:00
daanx
fb90938408 adjust stats more clearly to avoid double counting commits 2024-12-17 19:11:23 -08:00
daanx
2a3969ffc7 Merge branch 'dev3' into dev3-bin 2024-12-17 18:57:20 -08:00
Daan Leijen
58b726be6f better stats for commit on overcommit systems (by not counting on-demand commit upfront) 2024-12-17 18:57:00 -08:00
daanx
587eabe72b Merge branch 'dev3' into dev3-bin 2024-12-17 18:10:37 -08:00
daanx
84bb1c2712 adjust stats more clearly to avoid double counting commits 2024-12-17 18:10:28 -08:00
daanx
21c05019b7 Merge branch 'dev' into dev3 2024-12-17 17:54:24 -08:00
daanx
34d03f3981 atomically clear purge bits when visiting 2024-12-17 12:32:18 -08:00
daanx
6e2a64b81e merge from dev3 2024-12-17 11:58:02 -08:00
daanx
c585753dce fix purging with ranges 2024-12-17 11:54:26 -08:00
daanx
68a90ceb9a add ranges for purging 2024-12-17 11:44:14 -08:00
daanx
adfeb1f6f2 fix bug in bitmap_forall_ranges 2024-12-17 10:43:31 -08:00
daanx
fdad1a0d4f fix infoslices needed calculation 2024-12-17 09:49:09 -08:00
Daan Leijen
98171fd80a testing on arm64 2024-12-17 00:24:32 -08:00
Daan Leijen
d4a2813ff8 Merge branch 'dev3' into dev3-bin 2024-12-17 00:17:32 -08:00
Daan Leijen
63d0c8f861 merge from dev 2024-12-17 00:14:03 -08:00
daanx
d9397be178 comments 2024-12-16 10:00:32 -08:00
daanx
037cb167f8 comments 2024-12-16 09:51:54 -08:00
daanx
d2f670e6e5 add delay to purg'ing; call collect_retired every N generic allocs 2024-12-15 19:54:01 -08:00
daanx
3330d4353a remove maxaccessed from general bitmaps 2024-12-15 19:15:00 -08:00
daanx
e24217e69c more bbin size classes, bug fixes 2024-12-15 18:35:12 -08:00
daanx
df9009a060 wip: binned bitmap for the free slices 2024-12-15 17:15:56 -08:00
daanx
3153e5a4c5 small fixes 2024-12-15 13:47:33 -08:00
daanx
13ee94cef6 fix concurrent mi_tld access bug 2024-12-15 13:22:00 -08:00
daanx
4aeb2e1005 flexible clearN_ that can start at any index 2024-12-15 13:21:13 -08:00
daanx
b5dfd233e9 fix avx2 bug with atomics 2024-12-13 19:59:08 -08:00
daanx
216c04f8d9 clean up bitmap api 2024-12-13 18:39:03 -08:00
daanx
4c81c3cf90 enable purging of free committed slices from arenas 2024-12-13 13:17:00 -08:00
daanx
42af184ce9 wip: start on purge 2024-12-13 09:04:23 -08:00
daanx
ba39e4d65b wip: start on purge 2024-12-13 09:03:17 -08:00
Daan
3010d5890f fix assertion 2024-12-12 20:27:46 -08:00
daanx
e43eb1f191 nicer debug output 2024-12-12 20:22:24 -08:00
daanx
b53ac835f1 comment 2024-12-12 20:01:37 -08:00
daanx
623eaedf33 add debug output for page map; free tld on thread exit 2024-12-12 19:59:54 -08:00
daanx
637de624b3 fix free bug for meta data 2024-12-12 19:55:45 -08:00
daanx
d5c4a16e58 lower full page retain more aggressively in a threadpool 2024-12-12 17:57:36 -08:00
daanx
df956c4a17 use thread spacing for reclaim as well 2024-12-12 17:22:41 -08:00
daanx
98879ac8bc use thread spacing for reclaim as well 2024-12-12 17:22:00 -08:00
daanx
118bd8c97f space out threads when searching for free pages 2024-12-12 16:37:31 -08:00
daanx
94ce342ea9 maintain pages set for arenas; improve arena load/unload 2024-12-11 22:06:25 -08:00
daanx
aed76f2910 wip: allow arena (re)loading 2024-12-11 20:34:23 -08:00
daanx
ccf5e36e6b use frac 8 for reclaim_on_free and reabandon; halve full_page_retain if running in a threadpool 2024-12-11 16:26:39 -08:00
daanx
1c8d15abac fix build error 2024-12-11 14:30:44 -08:00
daanx
ab53a73cbd small updates 2024-12-11 14:29:06 -08:00
daanx
565656919e fix comments in types; fix guarded alignment bug 2024-12-11 13:04:37 -08:00
daanx
64eea823e4 use always abandon on heap delete 2024-12-11 09:24:38 -08:00
daanx
24d3c1bc14 heap meta data always uses mi_meta_zalloc 2024-12-11 09:16:28 -08:00
daanx
6774130c9a Merge ..\mimalloc into dev3 2024-12-10 20:46:12 -08:00
daanx
64c4181ffa better block alignment 2024-12-10 20:32:48 -08:00
daanx
c478ddaab4 fix MI_GUARDED build 2024-12-10 19:44:54 -08:00
daanx
2a1c346281 Merge branch 'dev3' of https://github.com/microsoft/mimalloc into dev3 2024-12-10 15:12:13 -08:00
Daan
13be5d6740 use non-null tld in heap_init 2024-12-10 15:11:46 -08:00
daanx
7cd8f31f30 improve popcount 2024-12-10 14:50:55 -08:00
Daan
f37aff6ee2 fix for macOS 14 and earlier 2024-12-09 22:27:40 -08:00
Daan
6798375f47 temporarily add macOS 13 and 12 for testing 2024-12-09 21:26:23 -08:00
Daan
5e434a6e66 merge from dev 2024-12-09 21:24:30 -08:00
daanx
c5a2d11193 add extra checks for valid pointers in the pagemap, add max_vabits and debug_commit_full_pagemap options 2024-12-09 20:40:26 -08:00
daanx
3a92c35270 improve generic ctz/clz 2024-12-09 20:25:22 -08:00
daanx
e44815ed6f add bsf/bsr for compilation with older compilers (clang 7) 2024-12-09 20:06:48 -08:00
daanx
56a1bd7f9e fix 32 bit multiply in generic ctz/clz 2024-12-09 19:43:00 -08:00
daanx
f28d5c7029 add cast to avoid errors on clang 7 2024-12-09 19:12:03 -08:00
daanx
bbcbd3cd1f add cast to avoid errors on clang 7 2024-12-09 19:06:06 -08:00
Daan
3f732a981f fix debug build of MI_GUARDED 2024-12-09 15:49:20 -08:00
Daan
8f5449d271 various fixes for test pipeline 2024-12-09 15:39:15 -08:00
Daan
351cb0c740 small fixes for macOS 2024-12-09 15:16:36 -08:00
daanx
d5ed0cc71e various improvements 2024-12-09 14:31:43 -08:00
daanx
68ac94c1ba set default arena reserve back to 1GiB 2024-12-08 18:53:43 -08:00
daanx
bf2f2a8bf4 fix bug where only the first chunkmap field would be considered 2024-12-08 18:48:56 -08:00
daanx
88990cec2d merge from dev 2024-12-08 18:27:05 -08:00
daanx
2a4af6f169 comments 2024-12-08 17:21:17 -08:00
daanx
2084df3dde add dedicated meta data allocation for threads and tld 2024-12-08 12:20:54 -08:00
daanx
67cc424ada delete old files 2024-12-08 09:19:05 -08:00
daanx
36bb599873 merge from dev 2024-12-08 09:15:09 -08:00
daanx
2ed6e03d27 update optimization on haswell 2024-12-08 09:14:16 -08:00
daanx
e446bc27e5 Merge ..\mimalloc into dev3 2024-12-08 09:03:33 -08:00
daanx
5a06d2aeba update bit primitives 2024-12-08 09:03:25 -08:00
daanx
c33de86da3 check for running in a threadpool to disable page reclaim 2024-12-07 17:11:11 -08:00
daanx
d0c86f3f0e specialize bitmap operations for common page sizes 2024-12-07 16:26:07 -08:00
daanx
bf42759d97 check heaptag on abandonded page allocation 2024-12-07 15:13:17 -08:00
daanx
6b52b19e3b arch specific optimizations 2024-12-07 15:02:27 -08:00
daanx
0e5d5831e4 Merge ..\mimalloc into dev3 2024-12-07 14:17:05 -08:00
daanx
bef52b96f6 Merge ../mimalloc into dev3 2024-12-07 14:04:02 -08:00
daanx
9631b0d4d2 revise visiting arenas, better bitmap scanning 2024-12-07 14:03:51 -08:00
daanx
70115d8b8c small fixes 2024-12-06 23:25:53 -08:00
daanx
bf9a2ddb59 compile for 32-bit as well 2024-12-06 23:07:10 -08:00
daanx
659a9dd51d fix page info size and order; atomic page flags 2024-12-06 22:37:59 -08:00
daanx
5a5943ad33 record max_clear bit 2024-12-06 21:03:33 -08:00
daanx
61436a92b9 working simplified version without pairmaps and bitmap epoch 2024-12-06 15:26:01 -08:00
daanx
ec9c61c066 initial no more pairmap 2024-12-06 14:53:24 -08:00
daanx
7443ee317e tune free-ing and abandoning 2024-12-05 17:00:23 -08:00
daanx
0616ee151e change to full_page_retain 2024-12-05 11:29:25 -08:00
daanx
bc67be4d79 small adjustments 2024-12-04 21:40:57 -08:00
daanx
afe9089152 more documentation; better pairmap find_and_set_to_busy, busy flag is now 0x10 2024-12-04 19:15:55 -08:00
daanx
45f7fb559a small fixes 2024-12-04 00:14:56 -08:00
daanx
bc7fe399b1 large bitmaps working; lock on arena_reserve 2024-12-03 23:35:33 -08:00
daanx
e5fdd6e110 wip: initial large bitmaps 2024-12-03 22:43:14 -08:00
daanx
8d9c725482 increase MAX_OBJ_SLICES to a full chunk (32MiB) 2024-12-03 17:27:43 -08:00
daanx
3fc2c8e279 fix assertions 2024-12-03 11:06:07 -08:00
daanx
666c089fc8 revise free reclaim; ensure unown cannot race with a free 2024-12-03 10:51:13 -08:00
daanx
833b091ff9 can run the full test suite 2024-12-02 20:25:44 -08:00
daanx
bd5f7de3f4 can run basic test 2024-12-02 20:21:35 -08:00
daanx
fe5a314114 add base and size to OS memid 2024-12-02 19:31:36 -08:00
daanx
5e95ebc7a0 fix free stats 2024-12-02 17:46:41 -08:00
daanx
c9abfe8253 wip: can run mstress 2024-12-02 16:24:40 -08:00
daanx
d96c134566 wip: initial version with eager abandonment 2024-12-02 16:01:45 -08:00
daanx
69ac69abac wip: use epoch with 512bit chunks 2024-12-02 00:31:08 -08:00
daanx
2f789aae9a wip: cannot compile 2024-12-01 16:26:59 -08:00
daanx
1d7a9f62a5 bug fixes 2024-12-01 12:54:16 -08:00
daanx
8f2a5864b8 pass all debug tests 2024-11-30 22:54:57 -08:00
daanx
9ebe941ce0 first version that passes the make test 2024-11-30 20:21:32 -08:00
daanx
55b70f1588 wip 2024-11-30 14:00:07 -08:00
daanx
f8d04dc2bc compile with clang and gcc 2024-11-30 12:41:11 -08:00
daanx
d15e83030e wip: rename arena blocks to slices 2024-11-30 12:16:41 -08:00
daanx
309fc26b4b wip: add generic find_and_xset 2024-11-30 12:00:30 -08:00
daanx
188294a0df wip: bug fixes 2024-11-30 11:12:39 -08:00
daanx
9d904e8643 wip: bug fixes 2024-11-30 10:39:30 -08:00
daanx
978d844e15 wip: bug fixes 2024-11-29 20:23:39 -08:00
daanx
0f635413d6 wip: can run initial test 2024-11-29 17:50:37 -08:00
daanx
e0152ab82f wip: update any_set 2024-11-29 16:58:52 -08:00
daanx
9603fe8b50 can compile without missing functions 2024-11-29 16:27:58 -08:00
daanx
68f5fb2f4b wip: further progress on segment removal; arena allocation 2024-11-29 15:08:06 -08:00
daanx
46afcbe06c wip: further progress on segment removal; arena allocation 2024-11-29 14:28:34 -08:00
daanx
441d4fed9f wip: further progress on removing segments 2024-11-29 10:40:18 -08:00
daanx
71cfa45e76 wip: initial work on mimalloc3 without segments 2024-11-28 19:31:04 -08:00
44 changed files with 6707 additions and 5012 deletions


@ -10,25 +10,30 @@ option(MI_PADDING "Enable padding to detect heap block overflow (alway
option(MI_OVERRIDE "Override the standard malloc interface (i.e. define entry points for 'malloc', 'free', etc)" ON)
option(MI_XMALLOC "Enable abort() call on memory allocation failure by default" OFF)
option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF)
option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF)
option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF)
option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF)
option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF)
option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF)
option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for arm64: '-march=armv8.1-a' (2016))" ON)
option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON)
option(MI_OPT_SIMD "Use SIMD instructions (requires MI_OPT_ARCH to be enabled)" OFF)
option(MI_SEE_ASM "Generate assembly files" OFF)
option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON)
option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON)
option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON)
option(MI_WIN_USE_FIXED_TLS "Use a fixed TLS slot on Windows to avoid extra tests in the malloc fast path" OFF)
option(MI_LOCAL_DYNAMIC_TLS "Use local-dynamic-tls, a slightly slower but dlopen-compatible thread local storage mechanism (Unix)" OFF)
option(MI_LIBC_MUSL "Set this when linking with musl libc" OFF)
option(MI_LIBC_MUSL "Enable this when linking with musl libc" OFF)
option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF)
option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF)
option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF)
option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF)
option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF)
option(MI_BUILD_SHARED "Build shared library" ON)
option(MI_BUILD_STATIC "Build static library" ON)
option(MI_BUILD_OBJECT "Build object library" ON)
option(MI_BUILD_TESTS "Build test executables" ON)
option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF)
option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF)
option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF)
option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF)
option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." OFF)
option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF)
@ -50,6 +55,7 @@ set(mi_sources
src/alloc-aligned.c
src/alloc-posix.c
src/arena.c
src/arena-meta.c
src/bitmap.c
src/heap.c
src/init.c
@ -57,9 +63,8 @@ set(mi_sources
src/options.c
src/os.c
src/page.c
src/page-map.c
src/random.c
src/segment.c
src/segment-map.c
src/stats.c
src/prim/prim.c)
@ -122,8 +127,8 @@ if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo")
if (NOT MI_OPT_ARCH)
message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)")
endif()
else()
set(MI_OPT_ARCH OFF)
#else()
# set(MI_OPT_ARCH OFF)
endif()
if(MI_OVERRIDE)
@ -227,7 +232,7 @@ endif()
if(MI_SEE_ASM)
message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)")
list(APPEND mi_cflags -save-temps)
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang")
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 14)
message(STATUS "No GNU Line marker")
list(APPEND mi_cflags -Wno-gnu-line-marker)
endif()
@ -398,21 +403,28 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM
list(APPEND mi_cflags -ftls-model=initial-exec)
endif()
endif()
endif()
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel")
if(MI_OVERRIDE)
list(APPEND mi_cflags -fno-builtin-malloc)
endif()
if(MI_OPT_ARCH)
if(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics
if(MI_ARCH STREQUAL "x64")
set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2;-mtune=native") # fast bit scan (since 2013)
elseif(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a;-mtune=native") # fast atomics (since 2016)
endif()
endif()
endif()
if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914)
if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) # vs2017+
list(APPEND mi_cflags /Zc:__cplusplus)
if(MI_OPT_ARCH)
if(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH_FLAGS "/arch:armv8.1") # fast atomics
if(MI_ARCH STREQUAL "x64")
set(MI_OPT_ARCH_FLAGS "/arch:AVX2")
elseif(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH_FLAGS "/arch:armv8.1")
endif()
endif()
endif()
@ -424,6 +436,12 @@ endif()
if(MI_OPT_ARCH_FLAGS)
list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS})
message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)")
if (MI_OPT_SIMD)
list(APPEND mi_defines "MI_OPT_SIMD=1")
message(STATUS "SIMD instructions are enabled (MI_OPT_SIMD=ON)")
endif()
elseif(MI_OPT_SIMD)
message(STATUS "SIMD instructions are not enabled (either MI_OPT_ARCH=OFF or this architecture has no SIMD support)")
endif()
# extra needed libraries


@ -306,3 +306,28 @@ jobs:
- script: ctest --verbose --timeout 240
workingDirectory: $(BuildType)
displayName: CTest
- job:
displayName: macOS 13 (Ventura)
pool:
vmImage:
macOS-13
strategy:
matrix:
Debug:
BuildType: debug
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
Release:
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
steps:
- task: CMake@1
inputs:
workingDirectory: $(BuildType)
cmakeArgs: .. $(cmakeExtraArgs)
- script: make -j$(sysctl -n hw.ncpu) -C $(BuildType)
displayName: Make
- script: ctest --verbose --timeout 180
workingDirectory: $(BuildType)
displayName: CTest


@ -1,6 +1,6 @@
set(mi_version_major 1)
set(mi_version_minor 8)
set(mi_version_patch 8)
set(mi_version_major 3)
set(mi_version_minor 0)
set(mi_version_patch 0)
set(mi_version ${mi_version_major}.${mi_version_minor})
set(PACKAGE_VERSION ${mi_version})


@ -431,12 +431,11 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large);
/// @param start Start of the memory area
/// @param size The size of the memory area.
/// @param is_committed Is the area already committed?
/// @param is_large Does it consist of large OS pages? Set this to \a true as well for memory
/// that should not be decommitted or protected (like rdma etc.)
/// @param is_pinned Can the memory not be decommitted or reset? (usually the case for large OS pages)
/// @param is_zero Does the area consists of zero's?
/// @param numa_node Possible associated numa node or `-1`.
/// @return \a true if successful, and \a false on error.
bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node);
bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node);
/// Reserve \a pages of huge OS pages (1GiB) evenly divided over \a numa_nodes nodes,
/// but stops after at most `timeout_msecs` seconds.
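As context for the `is_pinned` rename above, here is a minimal usage sketch (not taken from the repository) that hands a pre-mapped region to mimalloc; the POSIX `mmap` setup and the 64 MiB region size are illustrative assumptions.

// Sketch only: give mimalloc a caller-managed OS region using the renamed
// `is_pinned` parameter. Assumes a POSIX system; error handling kept minimal.
#include <sys/mman.h>
#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  const size_t size = 64 * 1024 * 1024;   // 64 MiB, an arbitrary example size
  void* start = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (start == MAP_FAILED) return 1;
  // committed and zeroed by mmap; not pinned, so mimalloc may decommit/reset it
  if (!mi_manage_os_memory(start, size, /*is_committed*/ true, /*is_pinned*/ false,
                           /*is_zero*/ true, /*numa_node*/ -1)) {
    fprintf(stderr, "mi_manage_os_memory failed\n");
    return 1;
  }
  void* p = mi_malloc(1024);   // may now be served from the managed area
  mi_free(p);
  return 0;
}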


@ -308,6 +308,7 @@
<CompileAs>CompileAsCpp</CompileAs>
<IntrinsicFunctions>true</IntrinsicFunctions>
<LanguageStandard>stdcpp20</LanguageStandard>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -421,16 +422,7 @@
</ClCompile>
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena-abandoned.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64EC'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\arena-meta.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
@ -450,6 +442,7 @@
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\libc.c" />
<ClCompile Include="..\..\src\page-map.c" />
<ClCompile Include="..\..\src\prim\prim.c" />
<ClCompile Include="..\..\src\prim\windows\prim.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
@ -474,8 +467,6 @@
</ClCompile>
<ClCompile Include="..\..\src\page.c" />
<ClCompile Include="..\..\src\random.c" />
<ClCompile Include="..\..\src\segment-map.c" />
<ClCompile Include="..\..\src\segment.c" />
<ClCompile Include="..\..\src\os.c" />
<ClCompile Include="..\..\src\stats.c" />
</ItemGroup>
@ -484,6 +475,7 @@
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-override.h" />
<ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\include\mimalloc\atomic.h" />
<ClInclude Include="..\..\include\mimalloc\bits.h" />
<ClInclude Include="..\..\include\mimalloc\internal.h" />
<ClInclude Include="..\..\include\mimalloc\prim.h" />
<ClInclude Include="..\..\include\mimalloc\track.h" />


@ -16,9 +16,6 @@
<ClCompile Include="..\..\src\arena.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-abandoned.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.c">
<Filter>Sources</Filter>
</ClCompile>
@ -55,15 +52,15 @@
<ClCompile Include="..\..\src\random.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\stats.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\page-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-meta.c">
<Filter>Sources</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\include\mimalloc\atomic.h">
@ -93,6 +90,9 @@
<ClInclude Include="..\..\include\mimalloc\prim.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc\bits.h">
<Filter>Headers</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Headers">


@ -404,11 +404,10 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h" />
<ClInclude Include="..\..\include\mimalloc-etw-gen.h" />
<ClInclude Include="..\..\include\mimalloc-etw.h" />
<ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\include\mimalloc-override.h" />
<ClInclude Include="..\..\include\mimalloc\atomic.h" />
<ClInclude Include="..\..\include\mimalloc\bits.h" />
<ClInclude Include="..\..\include\mimalloc\internal.h" />
<ClInclude Include="..\..\include\mimalloc\prim.h" />
<ClInclude Include="..\..\include\mimalloc\track.h" />
@ -438,7 +437,10 @@
</ClCompile>
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena-abandoned.c">
<ClCompile Include="..\..\src\arena-meta.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c" />
<ClCompile Include="..\..\src\free.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
@ -448,11 +450,10 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c" />
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\libc.c" />
<ClCompile Include="..\..\src\page-map.c" />
<ClCompile Include="..\..\src\prim\prim.c" />
<ClCompile Include="..\..\src\prim\windows\prim.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
@ -478,13 +479,8 @@
</ClCompile>
<ClCompile Include="..\..\src\page.c" />
<ClCompile Include="..\..\src\random.c" />
<ClCompile Include="..\..\src\segment-map.c" />
<ClCompile Include="..\..\src\segment.c" />
<ClCompile Include="..\..\src\stats.c" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\include\mimalloc-etw-gen.man" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>


@ -16,9 +16,6 @@
<ClCompile Include="..\..\src\arena.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-abandoned.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.c">
<Filter>Sources</Filter>
</ClCompile>
@ -52,15 +49,18 @@
<ClCompile Include="..\..\src\random.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\stats.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\page-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\free.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-meta.c">
<Filter>Sources</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\include\mimalloc\atomic.h">
@ -75,12 +75,6 @@
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-etw.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-etw-gen.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-new-delete.h">
<Filter>Headers</Filter>
</ClInclude>
@ -96,6 +90,9 @@
<ClInclude Include="..\..\include\mimalloc\prim.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc\bits.h">
<Filter>Headers</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Headers">
@ -105,9 +102,4 @@
<UniqueIdentifier>{94b40bdc-a741-45dd-81aa-c05fabcd2970}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include="..\..\include\mimalloc-etw-gen.man">
<Filter>Sources</Filter>
</None>
</ItemGroup>
</Project>


@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
#define MI_MALLOC_VERSION 188 // major + 2 digits minor
#define MI_MALLOC_VERSION 300 // major + 2 digits minor
// ------------------------------------------------------
// Compiler specific attributes
@ -274,16 +274,16 @@ mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa
mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept;
mi_decl_export void mi_debug_show_arenas(bool show_inuse) mi_attr_noexcept;
mi_decl_export void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept;
// Experimental: heaps associated with specific memory arena's
typedef int mi_arena_id_t;
typedef void* mi_arena_id_t;
mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size);
mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
#if MI_MALLOC_VERSION >= 182
// Create a heap that only allocates in the specified arena
@ -317,6 +317,23 @@ mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t samp
mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max);
// experimental
//mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size);
//mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size);
//mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size);
//mi_decl_export void mi_os_free(void* p, size_t size);
//mi_decl_export void mi_os_commit(void* p, size_t size);
//mi_decl_export void mi_os_decommit(void* p, size_t size);
mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size);
mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id);
mi_decl_export bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena);
mi_decl_export void mi_heap_unload(mi_heap_t* heap);
// Is a pointer contained in the given arena area?
mi_decl_export bool mi_arena_contains(mi_arena_id_t arena_id, const void* p);
// ------------------------------------------------------
// Convenience
// ------------------------------------------------------
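The arena and heap load/unload entry points added above are not documented further in this diff, so the following is only a hypothetical call sequence inferred from the declarations; the ordering, the reservation size, and the assumption that a heap pointer stays usable across unload/reload are all unverified.

// Hypothetical sketch based only on the declarations above; not verified v3 behavior.
#include <mimalloc.h>

void demo_arena_unload_reload(void) {
  mi_arena_id_t arena_id;
  if (mi_reserve_os_memory_ex(64 * 1024 * 1024, /*commit*/ true, /*allow_large*/ false,
                              /*exclusive*/ true, &arena_id) != 0) return;
  mi_heap_t* heap = mi_heap_new_in_arena(arena_id);   // heap that allocates in this arena
  void* p = mi_heap_malloc(heap, 128);

  // Detach the heap and then the arena (assumed order).
  void* base; size_t accessed_size; size_t size;
  mi_heap_unload(heap);
  mi_arena_unload(arena_id, &base, &accessed_size, &size);

  // ... the [base, base+size) region could now be transported or persisted ...

  // Re-attach later; assumed that the old heap and its pointers become valid again.
  mi_arena_reload(base, size, &arena_id);
  mi_heap_reload(heap, arena_id);
  mi_free(p);
  mi_heap_delete(heap);
}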
@ -369,7 +386,6 @@ typedef enum mi_option_e {
mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`)
mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10)
mi_option_purge_extend_delay,
mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1)
mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's)
mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. (only on windows)
mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0)
@ -379,6 +395,12 @@ typedef enum mi_option_e {
mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000)
mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0)
mi_option_target_segments_per_thread, // experimental (=0)
mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1)
mi_option_page_full_retain, // retain N full pages per size class (=2)
mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4)
mi_option_max_vabits, // max user space virtual address bits to consider (=48)
mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0)
mi_option_page_commit_on_demand, // commit page memory on-demand
_mi_option_last,
// legacy option names
mi_option_large_os_pages = mi_option_allow_large_os_pages,
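The new v3 options above can be tuned through the existing `mi_option_set`/`mi_option_enable` interface (or the matching `MIMALLOC_...` environment variables); a small sketch follows, with the defaults taken from the enum comments and the chosen values purely illustrative.

// Sketch only: adjust some of the new options at startup, before heavy allocation.
#include <mimalloc.h>

static void configure_mimalloc_v3(void) {
  mi_option_set(mi_option_page_full_retain, 4);         // default: 2 full pages retained per size class
  mi_option_set(mi_option_page_max_candidates, 8);      // default: 4 candidate pages considered
  mi_option_enable(mi_option_page_commit_on_demand);    // commit page memory only when it is used
  mi_option_enable(mi_option_pagemap_commit);           // commit the full page map to catch invalid pointer uses
}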


@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_ATOMIC_H
#define MIMALLOC_ATOMIC_H
#ifndef MI_ATOMIC_H
#define MI_ATOMIC_H
// include windows.h or pthreads.h
#if defined(_WIN32)
@ -75,16 +75,21 @@ terms of the MIT license. A copy of the license can be found in the file
#define mi_atomic_exchange_relaxed(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
#define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
#define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_and_relaxed(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_or_relaxed(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1)
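For the release/acquire compare-and-swap helpers added above, a minimal usage sketch (internal header, C11 path assumed; not code from the diff):

// Sketch only: monotonically raise a shared maximum with the weak CAS helper.
#include <stdatomic.h>
// assumes "mimalloc/atomic.h" has been included for the mi_atomic_* macros

static _Atomic(size_t) global_max;

static void update_max(size_t candidate) {
  size_t expected = mi_atomic_load_relaxed(&global_max);
  while (candidate > expected &&
         !mi_atomic_cas_weak_acq_rel(&global_max, &expected, candidate)) {
    // on failure the CAS reloads `expected`; loop until stored or no longer larger
  }
}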
@ -405,10 +410,9 @@ static inline void mi_atomic_yield(void) {
// ----------------------------------------------------------------------
// Locks
// These do not have to be recursive and should be light-weight
// in-process only locks. Only used for reserving arena's and to
// maintain the abandoned list.
// Locks
// These should be light-weight in-process only locks.
// Only used for reserving arena's and to maintain the abandoned list.
// ----------------------------------------------------------------------
#if _MSC_VER
#pragma warning(disable:26110) // unlock with holding lock
@ -534,4 +538,4 @@ static inline void mi_lock_done(mi_lock_t* lock) {
#endif
#endif // __MIMALLOC_ATOMIC_H
#endif // MI_ATOMIC_H

include/mimalloc/bits.h (new file, 336 lines)

@ -0,0 +1,336 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
Bit operation, and platform dependent definition (MI_INTPTR_SIZE etc)
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITS_H
#define MI_BITS_H
// ------------------------------------------------------
// Size of a pointer.
// We assume that `sizeof(void*)==sizeof(intptr_t)`
// and it holds for all platforms we know of.
//
// However, the C standard only requires that:
// p == (void*)((intptr_t)p))
// but we also need:
// i == (intptr_t)((void*)i)
// or otherwise one might define an intptr_t type that is larger than a pointer...
// ------------------------------------------------------
#if INTPTR_MAX > INT64_MAX
# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example)
#elif INTPTR_MAX == INT64_MAX
# define MI_INTPTR_SHIFT (3)
#elif INTPTR_MAX == INT32_MAX
# define MI_INTPTR_SHIFT (2)
#else
#error platform pointers must be 32, 64, or 128 bits
#endif
#if (INTPTR_MAX) > LONG_MAX
# define MI_PU(x) x##ULL
#else
# define MI_PU(x) x##UL
#endif
#if SIZE_MAX == UINT64_MAX
# define MI_SIZE_SHIFT (3)
typedef int64_t mi_ssize_t;
#elif SIZE_MAX == UINT32_MAX
# define MI_SIZE_SHIFT (2)
typedef int32_t mi_ssize_t;
#else
#error platform objects must be 32 or 64 bits in size
#endif
#if (SIZE_MAX/2) > LONG_MAX
# define MI_ZU(x) x##ULL
#else
# define MI_ZU(x) x##UL
#endif
#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8)
#define MI_SIZE_SIZE (1<<MI_SIZE_SHIFT)
#define MI_SIZE_BITS (MI_SIZE_SIZE*8)
#define MI_KiB (MI_ZU(1024))
#define MI_MiB (MI_KiB*MI_KiB)
#define MI_GiB (MI_MiB*MI_KiB)
/* --------------------------------------------------------------------------------
Architecture
-------------------------------------------------------------------------------- */
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) // consider arm64ec as arm64
#define MI_ARCH_ARM64 1
#elif defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
#define MI_ARCH_X64 1
#elif defined(__i386__) || defined(__i386) || defined(_M_IX86) || defined(_X86_) || defined(__X86__)
#define MI_ARCH_X86 1
#elif defined(__arm__) || defined(_ARM) || defined(_M_ARM) || defined(_M_ARMT) || defined(__arm)
#define MI_ARCH_ARM32 1
#elif defined(__riscv) || defined(_M_RISCV)
#define MI_ARCH_RISCV 1
#if (LONG_MAX == INT32_MAX)
#define MI_ARCH_RISCV32 1
#else
#define MI_ARCH_RISCV64 1
#endif
#endif
#if MI_ARCH_X64 && defined(__AVX2__)
#include <immintrin.h>
#elif MI_ARCH_ARM64 && MI_OPT_SIMD
#include <arm_neon.h>
#endif
#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
#include <intrin.h>
#endif
#if MI_ARCH_X64 && defined(__AVX2__) && !defined(__BMI2__) // msvc
#define __BMI2__ 1
#endif
#if MI_ARCH_X64 && (defined(__AVX2__) || defined(__BMI2__)) && !defined(__BMI1__) // msvc
#define __BMI1__ 1
#endif
// Define big endian if needed
// #define MI_BIG_ENDIAN 1
// maximum virtual address bits in a user-space pointer
#if MI_DEFAULT_VIRTUAL_ADDRESS_BITS > 0
#define MI_MAX_VABITS MI_DEFAULT_VIRTUAL_ADDRESS_BITS
#elif MI_ARCH_X64
#define MI_MAX_VABITS (47)
#elif MI_INTPTR_SIZE > 4
#define MI_MAX_VABITS (48)
#else
#define MI_MAX_VABITS (32)
#endif
// use a flat page-map (or a 2-level one)
#ifndef MI_PAGE_MAP_FLAT
#if MI_MAX_VABITS <= 40 && !defined(__APPLE__)
#define MI_PAGE_MAP_FLAT 1
#else
#define MI_PAGE_MAP_FLAT 0
#endif
#endif
/* --------------------------------------------------------------------------------
Builtin's
-------------------------------------------------------------------------------- */
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#define mi_builtin(name) __builtin_##name
#define mi_has_builtin(name) __has_builtin(__builtin_##name)
#if (LONG_MAX == INT32_MAX)
#define mi_builtin32(name) mi_builtin(name##l)
#define mi_has_builtin32(name) mi_has_builtin(name##l)
#else
#define mi_builtin32(name) mi_builtin(name)
#define mi_has_builtin32(name) mi_has_builtin(name)
#endif
#if (LONG_MAX == INT64_MAX)
#define mi_builtin64(name) mi_builtin(name##l)
#define mi_has_builtin64(name) mi_has_builtin(name##l)
#else
#define mi_builtin64(name) mi_builtin(name##ll)
#define mi_has_builtin64(name) mi_has_builtin(name##ll)
#endif
#if (MI_SIZE_BITS == 32)
#define mi_builtinz(name) mi_builtin32(name)
#define mi_has_builtinz(name) mi_has_builtin32(name)
#define mi_msc_builtinz(name) name
#elif (MI_SIZE_BITS == 64)
#define mi_builtinz(name) mi_builtin64(name)
#define mi_has_builtinz(name) mi_has_builtin64(name)
#define mi_msc_builtinz(name) name##64
#endif
/* --------------------------------------------------------------------------------
Popcount and count trailing/leading zero's
-------------------------------------------------------------------------------- */
size_t _mi_popcount_generic(size_t x);
static inline size_t mi_popcount(size_t x) {
#if mi_has_builtinz(popcount)
return mi_builtinz(popcount)(x);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
return mi_msc_builtinz(__popcnt)(x);
#elif MI_ARCH_X64 && defined(__BMI1__)
return (size_t)_mm_popcnt_u64(x);
#else
#define MI_HAS_FAST_POPCOUNT 0
return (x<=1 ? x : _mi_popcount_generic(x));
#endif
}
#ifndef MI_HAS_FAST_POPCOUNT
#define MI_HAS_FAST_POPCOUNT 1
#endif
size_t _mi_clz_generic(size_t x);
size_t _mi_ctz_generic(size_t x);
static inline size_t mi_ctz(size_t x) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0
size_t r;
__asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return r;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long idx;
return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS);
#elif mi_has_builtinz(ctz)
return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS);
#elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86)
if (x==0) return MI_SIZE_BITS;
size_t r;
__asm ("bsf\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return r;
#elif MI_HAS_FAST_POPCOUNT
return (x!=0 ? (mi_popcount(x^(x-1))-1) : MI_SIZE_BITS);
#else
#define MI_HAS_FAST_BITSCAN 0
return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS);
#endif
}
static inline size_t mi_clz(size_t x) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0
size_t r;
__asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return r;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long idx;
return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS);
#elif mi_has_builtinz(clz)
return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS);
#elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86)
if (x==0) return MI_SIZE_BITS;
size_t r;
__asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return (MI_SIZE_BITS - 1 - r);
#else
#define MI_HAS_FAST_BITSCAN 0
return (x!=0 ? _mi_clz_generic(x) : MI_SIZE_BITS);
#endif
}
#ifndef MI_HAS_FAST_BITSCAN
#define MI_HAS_FAST_BITSCAN 1
#endif
/* --------------------------------------------------------------------------------
find trailing/leading zero (bit scan forward/reverse)
-------------------------------------------------------------------------------- */
// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's)
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bsf(size_t x, size_t* idx) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9)
// on x64 the carry flag is set on zero which gives better codegen
bool is_zero;
__asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" );
return !is_zero;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long i;
return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false);
#else
return (x!=0 ? (*idx = mi_ctz(x), true) : false);
#endif
}
// Bit scan reverse: find the most significant bit that is set
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bsr(size_t x, size_t* idx) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9)
// on x64 the carry flag is set on zero which gives better codegen
bool is_zero;
__asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc");
return !is_zero;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long i;
return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false);
#else
return (x!=0 ? (*idx = MI_SIZE_BITS - 1 - mi_clz(x), true) : false);
#endif
}
/* --------------------------------------------------------------------------------
rotate
-------------------------------------------------------------------------------- */
static inline size_t mi_rotr(size_t x, size_t r) {
#if (mi_has_builtin(rotateright64) && MI_SIZE_BITS==64)
return mi_builtin(rotateright64)(x,r);
#elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32)
return mi_builtin(rotateright32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
return _rotr64(x, (int)r);
#elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
return _lrotr(x,(int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1);
return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1))));
#endif
}
static inline size_t mi_rotl(size_t x, size_t r) {
#if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64)
return mi_builtin(rotateleft64)(x,r);
#elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32)
return mi_builtin(rotateleft32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
return _rotl64(x, (int)r);
#elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
return _lrotl(x, (int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1);
return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1))));
#endif
}
static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) {
#if mi_has_builtin(rotateleft32)
return mi_builtin(rotateleft32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
return _lrotl(x, (int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & 31;
return ((x << rshift) | (x >> ((-rshift) & 31)));
#endif
}
#endif // MI_BITS_H
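As a small usage note for the scan primitives in this new header, the sketch below (not part of bits.h) enumerates the set bits of a word with `mi_bsf`, clearing the lowest set bit each round:

#include <stdio.h>
// assumes "mimalloc/bits.h" has been included

static void print_set_bits(size_t x) {
  size_t idx;
  while (mi_bsf(x, &idx)) {   // returns false once x == 0
    printf("bit %zu is set\n", idx);
    x &= (x - 1);             // clear the least significant set bit
  }
}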

File diff suppressed because it is too large.


@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_PRIM_H
#define MIMALLOC_PRIM_H
#ifndef MI_PRIM_H
#define MI_PRIM_H
// --------------------------------------------------------------------------
@ -117,7 +117,8 @@ void _mi_prim_thread_done_auto_done(void);
// Called when the default heap for a thread changes
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
// Is this thread part of a thread pool?
bool _mi_prim_thread_is_in_threadpool(void);
@ -269,35 +270,42 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
// defined in `init.c`; do not use these directly
extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
extern bool _mi_process_is_initialized; // has mi_process_init been called?
extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called?
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept;
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
const mi_threadid_t tid = __mi_prim_thread_id();
mi_assert_internal(tid > 1);
mi_assert_internal((tid & MI_PAGE_FLAG_MASK) == 0); // bottom 3 bits are clear?
return tid;
}
// Get a unique id for the current thread.
#if defined(MI_PRIM_THREAD_ID)
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488)
}
#elif defined(_WIN32)
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
// Windows: works on Intel and ARM in both 32- and 64-bit
return (uintptr_t)NtCurrentTeb();
}
#elif MI_USE_BUILTIN_THREAD_POINTER
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
// Works on most Unix based platforms with recent compilers
return (uintptr_t)__builtin_thread_pointer();
}
#elif MI_HAS_TLS_SLOT
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
#if defined(__BIONIC__)
// issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
// see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
@ -313,7 +321,7 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
#else
// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
return (uintptr_t)&_mi_heap_default;
}
@ -416,4 +424,4 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) {
#endif // mi_prim_get_default_heap()
#endif // MIMALLOC_PRIM_H
#endif // MI_PRIM_H

View file

@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_TRACK_H
#define MIMALLOC_TRACK_H
#ifndef MI_TRACK_H
#define MI_TRACK_H
/* ------------------------------------------------------------------------------------------------------
Track memory ranges with macros for tools like Valgrind, address sanitizer, or other memory checkers.
@ -142,4 +142,4 @@ defined, undefined, or not accessible at all:
}
#endif
#endif
#endif // MI_TRACK_H

View file

@ -5,17 +5,15 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_TYPES_H
#define MIMALLOC_TYPES_H
#ifndef MI_TYPES_H
#define MI_TYPES_H
// --------------------------------------------------------------------------
// This file contains the main type definitions for mimalloc:
// mi_heap_t : all data for a thread-local heap, contains
// lists of all managed heap pages.
// mi_segment_t : a larger chunk of memory (32GiB) from where pages
// are allocated.
// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from
// where objects are allocated.
// where objects of a single size are allocated.
// Note: we write "OS page" for OS memory pages while
// using plain "page" for mimalloc pages (`mi_page_t`).
// --------------------------------------------------------------------------
@ -23,11 +21,9 @@ terms of the MIT license. A copy of the license can be found in the file
#include <stddef.h> // ptrdiff_t
#include <stdint.h> // uintptr_t, uint16_t, etc
#include "atomic.h" // _Atomic
#ifdef _MSC_VER
#pragma warning(disable:4214) // bitfield is not int
#endif
#include <errno.h> // error codes
#include "bits.h" // size defines (MI_INTPTR_SIZE etc), bit operations
#include "atomic.h" // _Atomic primitives
// Minimal alignment necessary. On most platforms 16 bytes are needed
// due to SSE registers for example. This must be at least `sizeof(void*)`
@ -50,11 +46,17 @@ terms of the MIT license. A copy of the license can be found in the file
// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
// #define MI_STAT 1
// Define MI_SECURE to enable security mitigations
// #define MI_SECURE 1 // guard page around metadata
// #define MI_SECURE 2 // guard page around each mimalloc page
// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
// #define MI_SECURE 4 // checks for double free. (may be more expensive)
// Define MI_SECURE to enable security mitigations. Level 1 has minimal performance impact,
// but protects most metadata with guard pages:
// #define MI_SECURE 1 // guard page around metadata
//
// Level 2 has more performance impact but protects well against various buffer overflows
// by surrounding all mimalloc pages with guard pages:
// #define MI_SECURE 2 // guard page around each mimalloc page (can fragment VMA's with large heaps..)
//
// The next two levels can have more performance cost:
// #define MI_SECURE 3 // randomize allocations, encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
// #define MI_SECURE 4 // checks for double free. (may be more expensive)
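// For example (an illustrative invocation, not part of this diff), a build with all
// mitigations enabled would define the level on the compiler command line:
//   cc -O2 -DMI_SECURE=4 -I include -c src/static.c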
#if !defined(MI_SECURE)
#define MI_SECURE 0
@ -97,124 +99,130 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_ENCODE_FREELIST 1
#endif
// Enable large pages for objects between 128KiB and 512KiB. Disabled by default.
#ifndef MI_ENABLE_LARGE_PAGES
#define MI_ENABLE_LARGE_PAGES 0
#endif
// We used to abandon huge pages in order to eagerly deallocate them if freed from another thread.
// Unfortunately, that makes it impossible to visit them during a heap walk or include them in a
// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from
// another thread so the memory becomes "virtually" available (and eventually gets properly freed by
// the owning thread).
// #define MI_HUGE_PAGE_ABANDON 1
// --------------------------------------------------------------
// Sizes of internal data-structures
// (comments specify sizes on 64-bit, usually 32-bit is halved)
// --------------------------------------------------------------
// ------------------------------------------------------
// Platform specific values
// ------------------------------------------------------
// ------------------------------------------------------
// Size of a pointer.
// We assume that `sizeof(void*)==sizeof(intptr_t)`
// and it holds for all platforms we know of.
//
// However, the C standard only requires that:
// p == (void*)((intptr_t)p))
// but we also need:
// i == (intptr_t)((void*)i)
// or otherwise one might define an intptr_t type that is larger than a pointer...
// ------------------------------------------------------
#if INTPTR_MAX > INT64_MAX
# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example)
#elif INTPTR_MAX == INT64_MAX
# define MI_INTPTR_SHIFT (3)
#elif INTPTR_MAX == INT32_MAX
# define MI_INTPTR_SHIFT (2)
// Sizes are for 64-bit
#ifndef MI_ARENA_SLICE_SHIFT
#ifdef MI_SMALL_PAGE_SHIFT // backward compatibility
#define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT
#else
#error platform pointers must be 32, 64, or 128 bits
#define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit)
#endif
#endif
#ifndef MI_BCHUNK_BITS_SHIFT
#define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512)
#endif
#if SIZE_MAX == UINT64_MAX
# define MI_SIZE_SHIFT (3)
typedef int64_t mi_ssize_t;
#elif SIZE_MAX == UINT32_MAX
# define MI_SIZE_SHIFT (2)
typedef int32_t mi_ssize_t;
#else
#error platform objects must be 32 or 64 bits
#endif
#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) // sub-bitmaps are "bchunks" of 512 bits
#define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) // arena's allocate in slices of 64 KiB
#define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE)
#if (SIZE_MAX/2) > LONG_MAX
# define MI_ZU(x) x##ULL
# define MI_ZI(x) x##LL
#else
# define MI_ZU(x) x##UL
# define MI_ZI(x) x##L
#endif
#define MI_ARENA_MIN_OBJ_SLICES (1)
#define MI_ARENA_MAX_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries)
#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8)
#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE)
#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE)
#define MI_SIZE_SIZE (1<<MI_SIZE_SHIFT)
#define MI_SIZE_BITS (MI_SIZE_SIZE*8)
#define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE // 64 KiB
#define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bchunk bitmap)
#define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bchunk bitmap)
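// [illustrative arithmetic, not part of this diff] on 64-bit (MI_SIZE_SHIFT == 3) the
// definitions above work out to:
//   MI_ARENA_SLICE_SIZE   = 1 << (13+3)  = 64 KiB
//   MI_BCHUNK_BITS        = 1 << (6+3)   = 512 bits per bitmap chunk
//   MI_ARENA_MAX_OBJ_SIZE = 512 * 64 KiB = 32 MiB
//   MI_SMALL_PAGE_SIZE = 64 KiB, MI_MEDIUM_PAGE_SIZE = 512 KiB, MI_LARGE_PAGE_SIZE = 8 * 512 KiB = 4 MiB
// so a medium page corresponds to a byte, and a large page to a word, in a bchunk bitmap.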
#define MI_KiB (MI_ZU(1024))
#define MI_MiB (MI_KiB*MI_KiB)
#define MI_GiB (MI_MiB*MI_KiB)
// ------------------------------------------------------
// Main internal data-structures
// ------------------------------------------------------
// Main tuning parameters for segment and page sizes
// Sizes for 64-bit, divide by two for 32-bit
#ifndef MI_SMALL_PAGE_SHIFT
#define MI_SMALL_PAGE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB
#endif
#ifndef MI_MEDIUM_PAGE_SHIFT
#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB
#endif
#ifndef MI_LARGE_PAGE_SHIFT
#define MI_LARGE_PAGE_SHIFT ( 3 + MI_MEDIUM_PAGE_SHIFT) // 4MiB
#endif
#ifndef MI_SEGMENT_SHIFT
#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB -- must be equal to `MI_LARGE_PAGE_SHIFT`
#endif
// Derived constants
#define MI_SEGMENT_SIZE (MI_ZU(1)<<MI_SEGMENT_SHIFT)
#define MI_SEGMENT_ALIGN (MI_SEGMENT_SIZE)
#define MI_SEGMENT_MASK ((uintptr_t)(MI_SEGMENT_ALIGN - 1))
#define MI_SMALL_PAGE_SIZE (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
#define MI_MEDIUM_PAGE_SIZE (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
#define MI_LARGE_PAGE_SIZE (MI_ZU(1)<<MI_LARGE_PAGE_SHIFT)
#define MI_SMALL_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
#define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)
#define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE)
// The max object sizes are checked so that we do not waste more than 12.5% internally over the page sizes.
// (Except for large pages since huge objects are allocated in 4MiB chunks)
#define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/4) // 16KiB
#define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128KiB
#define MI_LARGE_OBJ_SIZE_MAX (MI_LARGE_PAGE_SIZE/2) // 2MiB
#define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
// Maximum number of size classes. (spaced exponentially in 12.5% increments)
#define MI_BIN_HUGE (73U)
#define MI_BIN_FULL (MI_BIN_HUGE+1)
#define MI_BIN_COUNT (MI_BIN_FULL+1)
#if (MI_LARGE_OBJ_WSIZE_MAX >= 655360)
#error "mimalloc internal: define more bins"
#endif
// Maximum block size for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`)
#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX)
// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments
#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1)
// We never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX
#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX
// Minimal commit for a page on-demand commit (should be >= OS page size)
#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE // (4*MI_KiB)
// ------------------------------------------------------
// Arena's are large reserved areas of memory allocated from
// the OS that are managed by mimalloc to efficiently
// allocate MI_ARENA_SLICE_SIZE slices of memory for the
// mimalloc pages.
// ------------------------------------------------------
// A large memory arena where pages are allocated in.
typedef struct mi_arena_s mi_arena_t; // defined in `arena.c`
// ---------------------------------------------------------------
// a memory id tracks the provenance of arena/OS allocated memory
// ---------------------------------------------------------------
// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated.
// The memid keeps track of this.
typedef enum mi_memkind_e {
MI_MEM_NONE, // not allocated
MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
MI_MEM_STATIC, // allocated in a static area and should not be freed (the initial main heap data for example (`init.c`))
MI_MEM_META, // allocated with the meta data allocator (`arena-meta.c`)
MI_MEM_OS, // allocated from the OS
MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
MI_MEM_OS_REMAP, // allocated in a remappable area (i.e. using `mremap`)
MI_MEM_ARENA // allocated from an arena (the usual case) (`arena.c`)
} mi_memkind_t;
static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
}
static inline bool mi_memkind_needs_no_free(mi_memkind_t memkind) {
return (memkind <= MI_MEM_STATIC);
}
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t size; // allocated full size
// size_t alignment; // alignment at allocation
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
mi_arena_t* arena; // arena that contains this memory
uint32_t slice_index; // slice index in the arena
uint32_t slice_count; // allocated slices
} mi_memid_arena_info_t;
typedef struct mi_memid_meta_info {
void* meta_page; // meta-page that contains the block
uint32_t block_index; // block index in the meta-data page
uint32_t block_count; // allocated blocks
} mi_memid_meta_info_t;
typedef struct mi_memid_s {
union {
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
mi_memid_meta_info_t meta; // only used for MI_MEM_META
} mem;
mi_memkind_t memkind;
bool is_pinned; // `true` if we cannot decommit/reset/protect this memory (e.g. when allocated using large (2MiB) or huge (1GiB) OS pages)
bool initially_committed;// `true` if the memory was originally allocated as committed
bool initially_zero; // `true` if the memory was originally zero initialized
} mi_memid_t;
static inline bool mi_memid_is_os(mi_memid_t memid) {
return mi_memkind_is_os(memid.memkind);
}
static inline bool mi_memid_needs_no_free(mi_memid_t memid) {
return mi_memkind_needs_no_free(memid.memkind);
}
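// [illustrative sketch, not part of this diff] how a memid might be filled in for plain
// OS-allocated memory and dispatched on when freeing; `mi_example_memid_create_os` is a
// hypothetical helper used only for illustration.
static inline mi_memid_t mi_example_memid_create_os(void* base, size_t size, bool committed, bool zeroed, bool pinned) {
  mi_memid_t memid;
  memid.memkind = MI_MEM_OS;
  memid.mem.os.base = base;                 // remember the real base for offset-aligned allocations
  memid.mem.os.size = size;
  memid.is_pinned = pinned;                 // e.g. true for large/huge OS pages
  memid.initially_committed = committed;
  memid.initially_zero = zeroed;
  return memid;
}
// on free one would dispatch on the kind, roughly:
//   if (mi_memid_needs_no_free(memid))  { /* none/external/static: nothing to do */ }
//   else if (mi_memid_is_os(memid))     { /* give memid.mem.os.base/size back to the OS */ }
//   else                                { /* MI_MEM_ARENA or MI_MEM_META: release the slices/blocks */ }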
// ------------------------------------------------------
// Mimalloc pages contain allocated blocks
@ -232,48 +240,28 @@ typedef struct mi_block_s {
mi_encoded_t next;
} mi_block_t;
#if MI_GUARDED
// we always align guarded pointers in a block at an offset
// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones
#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0))
#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED)
#endif
// The `in_full` and `has_aligned` page flags are put in the bottom bits of the thread_id (for fast test in `mi_free`)
// `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing)
// `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing)
#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01)
#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02)
#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04)
#define MI_PAGE_FLAG_MASK MI_ZU(0x07)
typedef size_t mi_page_flags_t;
// The delayed flags are used for efficient multi-threaded free-ing
typedef enum mi_delayed_e {
MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list
MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap
MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
MI_NEVER_DELAYED_FREE = 3 // sticky: used for abandoned pages without an owning heap; this only resets on page reclaim
} mi_delayed_t;
// The `in_full` and `has_aligned` page flags are put in a union to efficiently
// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
#if !MI_TSAN
typedef union mi_page_flags_s {
uint8_t full_aligned;
struct {
uint8_t in_full : 1;
uint8_t has_aligned : 1;
} x;
} mi_page_flags_t;
#else
// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
typedef union mi_page_flags_s {
uint32_t full_aligned;
struct {
uint8_t in_full;
uint8_t has_aligned;
} x;
} mi_page_flags_t;
#endif
// Thread free list.
// We use the bottom 2 bits of the pointer for mi_delayed_t flags
// Points to a list of blocks that are freed by other threads.
// The low-bit is set if the page is owned by the current thread. (`mi_page_is_owned`).
// Ownership is required before we can read any non-atomic fields in the page.
// This way we can push a block on the thread free list and try to claim ownership
// atomically in `free.c:mi_free_block_mt`.
typedef uintptr_t mi_thread_free_t;
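// [illustrative sketch, not part of this diff] pushing a freed block onto a thread free
// list with a compare-and-swap while preserving the ownership bit (bit 0); free-list
// encoding is ignored and the helper name is hypothetical (atomics from `mimalloc/atomic.h`).
static inline void mi_example_thread_free_push(_Atomic(mi_thread_free_t)* xthread_free, mi_block_t* block) {
  mi_thread_free_t tf_old = mi_atomic_load_relaxed(xthread_free);
  mi_thread_free_t tf_new;
  do {
    block->next = (mi_encoded_t)(tf_old & ~(mi_thread_free_t)1);  // link to the current head (mask out the ownership bit)
    tf_new = (mi_thread_free_t)block | (tf_old & 1);              // new head, ownership bit unchanged
  } while (!mi_atomic_cas_weak_release(xthread_free, &tf_old, tf_new));
}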
// A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython)
typedef uint8_t mi_heaptag_t;
// A page contains blocks of one specific size (`block_size`).
// Each page has three lists of free blocks:
// `free` for blocks that can be allocated,
@ -291,160 +279,93 @@ typedef uintptr_t mi_thread_free_t;
// the number of memory accesses in the `mi_page_all_free` function(s).
//
// Notes:
// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
// - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`).
// - If a page is not part of a heap it is called "abandoned" (`heap==NULL`) -- in
// that case the `xthread_id` is 0 or 1 (1 is for abandoned pages that
// are in the abandoned page lists of an arena; these are called "mapped" abandoned pages).
// - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
// - Using `uint16_t` does not seem to slow things down
// - The size is 10 words on 64-bit which helps the page index calculations
// (and 12 words on 32-bit, and encoded free lists add 2 words)
// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize
// concurrent frees where only the first concurrent free adds to the owning
// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`).
// The invariant is that no-delayed-free is only set if there is
// at least one block that will be added, or has already been added, to
// the owning heap `thread_delayed_free` list. This guarantees that pages
// will be freed correctly even if only other threads free blocks.
typedef struct mi_page_s {
// "owned" by the segment
uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]`
uint8_t segment_in_use:1; // `true` if the segment allocated this page
uint8_t is_committed:1; // `true` if the page virtual memory is committed
uint8_t is_zero_init:1; // `true` if the page was initially zero initialized
uint8_t is_huge:1; // `true` if the page is in a huge segment
_Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= heap->thread_id, or 0 or 1 if abandoned)
// layout like this to optimize access in `mi_malloc` and `mi_free`
uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear`
uint16_t reserved; // number of blocks reserved in memory
mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits)
uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized
uint8_t retire_expire:7; // expiration count for retired blocks
mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
uint16_t used; // number of blocks in use (including blocks in `thread_free`)
uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation)
uint16_t reserved; // number of blocks reserved in memory
uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
uint8_t retire_expire; // expiration count for retired blocks
mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
uint16_t used; // number of blocks in use (including blocks in `thread_free`)
uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type
// padding
size_t block_size; // size available in each block (always `>0`)
uint8_t* page_start; // start of the page area containing the blocks
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
size_t block_size; // size available in each block (always `>0`)
uint8_t* page_start; // start of the blocks
mi_heaptag_t heap_tag; // tag of the owning heap, used to separate heaps by object type
bool free_is_zero; // `true` if the blocks in the free list are zero initialized
// padding
#if (MI_ENCODE_FREELIST || MI_PADDING)
uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
#endif
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
_Atomic(uintptr_t) xheap;
struct mi_page_s* next; // next page owned by the heap with the same `block_size`
struct mi_page_s* prev; // previous page owned by the heap with the same `block_size`
#if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit
void* padding[1];
#endif
mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages)
struct mi_page_s* next; // next page owned by the heap with the same `block_size`
struct mi_page_s* prev; // previous page owned by the heap with the same `block_size`
size_t slice_committed; // committed size relative to the first arena slice of the page data (or 0 if the page is fully committed already)
mi_memid_t memid; // provenance of the page memory
} mi_page_t;
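// [illustrative sketch, not part of this diff] two consequences of the layout above; the
// helper names are hypothetical and use plain reads of the atomic field for brevity:
static inline bool mi_example_page_is_plain_and_local(mi_page_t* page, mi_threadid_t tid) {
  return (page->xthread_id == tid);   // same thread id and no flag bits set (see MI_PAGE_FLAG_MASK)
}
static inline bool mi_example_page_all_free(mi_page_t* page) {
  return (page->used == 0);           // `used` already includes blocks still on `xthread_free`
}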
// ------------------------------------------------------
// Object sizes
// ------------------------------------------------------
#define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map.
#define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b)
#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks (such that we guarantee natural alignment)
#define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation
#if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8
#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*32) // 160 >= sizeof(mi_page_t)
#else
#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*32) // 128/96 >= sizeof(mi_page_t)
#endif
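// [illustrative arithmetic, not part of this diff] on 64-bit MI_INTPTR_SHIFT == 3, so the
// page info size is (3+2)*32 = 160 bytes when free lists are encoded (or padded), and
// (3+1)*32 = 128 bytes otherwise (96 bytes on 32-bit); in each case a multiple of 32 bytes
// that is at least sizeof(mi_page_t), matching the comments above.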
// The max object sizes are checked so that we do not waste more than 12.5% internally over the page sizes.
// (Except for large pages since huge objects are allocated in 4MiB chunks)
#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB
#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB
#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
#define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE)
#if (MI_LARGE_MAX_OBJ_WSIZE >= 655360)
#error "mimalloc internal: define more bins"
#endif
// ------------------------------------------------------
// Mimalloc segments contain mimalloc pages
// Page kinds
// ------------------------------------------------------
typedef enum mi_page_kind_e {
MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment
MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment
MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment
MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned)
// used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an alignment `> MI_BLOCK_ALIGNMENT_MAX`.
MI_PAGE_SMALL, // small blocks go into 64KiB pages
MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages
MI_PAGE_LARGE, // larger blocks go into 4MiB pages
MI_PAGE_SINGLETON // page containing a single block.
// used for blocks `> MI_LARGE_MAX_OBJ_SIZE` or an alignment `> MI_PAGE_MAX_OVERALLOC_ALIGN`.
} mi_page_kind_t;
// ---------------------------------------------------------------
// a memory id tracks the provenance of arena/OS allocated memory
// ---------------------------------------------------------------
// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this.
typedef enum mi_memkind_e {
MI_MEM_NONE, // not allocated
MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example)
MI_MEM_OS, // allocated from the OS
MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
MI_MEM_OS_REMAP, // allocated in a remappable area (i.e. using `mremap`)
MI_MEM_ARENA // allocated from an arena (the usual case)
} mi_memkind_t;
static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
}
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t size; // full allocation size
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
size_t block_index; // index in the arena
mi_arena_id_t id; // arena id (>= 1)
bool is_exclusive; // this arena can only be used for specific arena allocations
} mi_memid_arena_info_t;
typedef struct mi_memid_s {
union {
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
} mem;
bool is_pinned; // `true` if we cannot decommit/reset/protect this memory (e.g. when allocated using large (2MiB) or huge (1GiB) OS pages)
bool initially_committed;// `true` if the memory was originally allocated as committed
bool initially_zero; // `true` if the memory was originally zero initialized
mi_memkind_t memkind;
} mi_memid_t;
// ---------------------------------------------------------------
// Segments contain mimalloc pages
// ---------------------------------------------------------------
typedef struct mi_subproc_s mi_subproc_t;
// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS.
// Inside segments we allocate fixed size _pages_ that contain blocks.
typedef struct mi_segment_s {
// constant fields
mi_memid_t memid; // memory id to track provenance
bool allow_decommit;
bool allow_purge;
size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE`
mi_subproc_t* subproc; // segment belongs to sub process
// segment fields
struct mi_segment_s* next; // must be the first (non-constant) segment field -- see `segment.c:segment_init`
struct mi_segment_s* prev;
bool was_reclaimed; // true if it was reclaimed (used to limit reclaim-on-free reclamation)
bool dont_free; // can be temporarily true to ensure the segment is not freed
size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it is too long)
size_t used; // count of pages in use (`used <= capacity`)
size_t capacity; // count of available pages (`#free + used`)
size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages.
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled
struct mi_segment_s* abandoned_os_prev;
// layout like this to optimize access in `mi_free`
_Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment
size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge
mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
} mi_segment_t;
// ------------------------------------------------------
// Heaps
//
// Provide first-class heaps to allocate from.
// A heap just owns a set of pages for allocation and
// can only allocate/reallocate from the thread that created it.
// Freeing blocks can be done from any thread though.
// Per thread, the segments are shared among its heaps.
//
// Per thread, there is always a default heap that is
// used for allocation; it is initialized to statically
// point to an empty heap to avoid initialization checks
@ -461,8 +382,6 @@ typedef struct mi_page_queue_s {
size_t block_size;
} mi_page_queue_t;
#define MI_BIN_FULL (MI_BIN_HUGE+1)
// Random context
typedef struct mi_random_cxt_s {
uint32_t input[16];
@ -473,7 +392,7 @@ typedef struct mi_random_cxt_s {
// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows
#if (MI_PADDING)
#if MI_PADDING
typedef struct mi_padding_s {
uint32_t canary; // encoded block value to check validity of the padding (in case of overflow)
uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes)
@ -490,18 +409,18 @@ typedef struct mi_padding_s {
// A heap owns a set of pages.
struct mi_heap_s {
mi_tld_t* tld;
_Atomic(mi_block_t*) thread_delayed_free;
mi_threadid_t thread_id; // thread this heap belongs to
mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0)
mi_tld_t* tld; // thread-local data
mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL)
uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`)
uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list
mi_random_ctx_t random; // random number context used for secure allocation
size_t page_count; // total number of pages in the `pages` queues.
size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues)
size_t page_retired_max; // largest retired index into the `pages` array.
size_t generic_count; // how often is mimalloc_generic invoked?
mi_heap_t* next; // list of heaps per thread
bool no_reclaim; // `true` if this heap should not reclaim abandoned pages
long full_page_retain; // how many full pages can be retained per queue (before abandoning them)
bool allow_page_reclaim; // `true` if this heap can reclaim abandoned pages
bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint
uint8_t tag; // custom tag, can be used for separating heaps based on the object types
#if MI_GUARDED
size_t guarded_size_min; // minimal size for guarded objects
@ -511,45 +430,11 @@ struct mi_heap_s {
size_t guarded_sample_count; // current sample count (counting down to 0)
#endif
mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points to a page with possibly free blocks in the corresponding queue for that size.
mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin")
mi_page_queue_t pages[MI_BIN_COUNT]; // queue of pages for each size class (or "bin")
mi_memid_t memid; // provenance of the heap struct itself (meta or os)
};
// ------------------------------------------------------
// Debug
// ------------------------------------------------------
#if !defined(MI_DEBUG_UNINIT)
#define MI_DEBUG_UNINIT (0xD0)
#endif
#if !defined(MI_DEBUG_FREED)
#define MI_DEBUG_FREED (0xDF)
#endif
#if !defined(MI_DEBUG_PADDING)
#define MI_DEBUG_PADDING (0xDE)
#endif
#if (MI_DEBUG)
// use our own assertion to print without memory allocation
void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
#else
#define mi_assert(x)
#endif
#if (MI_DEBUG>1)
#define mi_assert_internal mi_assert
#else
#define mi_assert_internal(x)
#endif
#if (MI_DEBUG>2)
#define mi_assert_expensive mi_assert
#else
#define mi_assert_expensive(x)
#endif
// ------------------------------------------------------
// Statistics
// ------------------------------------------------------
@ -575,82 +460,118 @@ typedef struct mi_stat_counter_s {
} mi_stat_counter_t;
typedef struct mi_stats_s {
mi_stat_count_t segments;
mi_stat_count_t pages;
mi_stat_count_t reserved;
mi_stat_count_t committed;
mi_stat_count_t reset;
mi_stat_count_t purged;
mi_stat_count_t page_committed;
mi_stat_count_t segments_abandoned;
mi_stat_count_t pages_abandoned;
mi_stat_count_t threads;
mi_stat_count_t normal;
mi_stat_count_t huge;
mi_stat_count_t giant;
mi_stat_count_t malloc;
mi_stat_count_t segments_cache;
mi_stat_count_t pages;
mi_stat_count_t reserved;
mi_stat_count_t committed;
mi_stat_count_t reset;
mi_stat_count_t purged;
mi_stat_count_t page_committed;
mi_stat_count_t pages_abandoned;
mi_stat_count_t threads;
mi_stat_count_t normal;
mi_stat_count_t huge;
mi_stat_count_t giant;
mi_stat_count_t malloc;
mi_stat_counter_t pages_extended;
mi_stat_counter_t pages_reclaim_on_alloc;
mi_stat_counter_t pages_reclaim_on_free;
mi_stat_counter_t pages_reabandon_full;
mi_stat_counter_t pages_unabandon_busy_wait;
mi_stat_counter_t mmap_calls;
mi_stat_counter_t commit_calls;
mi_stat_counter_t reset_calls;
mi_stat_counter_t purge_calls;
mi_stat_counter_t arena_purges;
mi_stat_counter_t page_no_retire;
mi_stat_counter_t searches;
mi_stat_counter_t normal_count;
mi_stat_counter_t huge_count;
mi_stat_counter_t arena_count;
mi_stat_counter_t arena_crossover_count;
mi_stat_counter_t arena_rollback_count;
mi_stat_counter_t guarded_alloc_count;
#if MI_STAT>1
mi_stat_count_t normal_bins[MI_BIN_HUGE+1];
mi_stat_count_t normal_bins[MI_BIN_COUNT];
#endif
} mi_stats_t;
// add to stat keeping track of the peak
void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
void __mi_stat_increase(mi_stat_count_t* stat, size_t amount);
void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount);
void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount);
// adjust stat in special cases to compensate for double counting
void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount);
void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount);
void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc);
void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free);
void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc);
void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_free);
// counters can just be increased
void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount);
#if (MI_STAT)
#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount)
#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount)
#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount)
#define mi_stat_adjust_increase(stat,amount) _mi_stat_adjust_increase( &(stat), amount)
#define mi_stat_adjust_decrease(stat,amount) _mi_stat_adjust_decrease( &(stat), amount)
#define mi_debug_stat_increase(stat,amount) __mi_stat_increase( &(stat), amount)
#define mi_debug_stat_decrease(stat,amount) __mi_stat_decrease( &(stat), amount)
#define mi_debug_stat_counter_increase(stat,amount) __mi_stat_counter_increase( &(stat), amount)
#define mi_debug_stat_increase_mt(stat,amount) __mi_stat_increase_mt( &(stat), amount)
#define mi_debug_stat_decrease_mt(stat,amount) __mi_stat_decrease_mt( &(stat), amount)
#define mi_debug_stat_counter_increase_mt(stat,amount) __mi_stat_counter_increase_mt( &(stat), amount)
#define mi_debug_stat_adjust_increase_mt(stat,amnt,b) __mi_stat_adjust_increase_mt( &(stat), amnt, b)
#define mi_debug_stat_adjust_decrease_mt(stat,amnt,b) __mi_stat_adjust_decrease_mt( &(stat), amnt, b)
#else
#define mi_stat_increase(stat,amount) ((void)0)
#define mi_stat_decrease(stat,amount) ((void)0)
#define mi_stat_counter_increase(stat,amount) ((void)0)
#define mi_stat_adjust_increase(stat,amount) ((void)0)
#define mi_stat_adjust_decrease(stat,amount) ((void)0)
#define mi_debug_stat_increase(stat,amount) ((void)0)
#define mi_debug_stat_decrease(stat,amount) ((void)0)
#define mi_debug_stat_counter_increase(stat,amount) ((void)0)
#define mi_debug_stat_increase_mt(stat,amount) ((void)0)
#define mi_debug_stat_decrease_mt(stat,amount) ((void)0)
#define mi_debug_stat_counter_increase_mt(stat,amount) ((void)0)
#define mi_debug_stat_adjust_increase(stat,amnt,b) ((void)0)
#define mi_debug_stat_adjust_decrease(stat,amnt,b) ((void)0)
#endif
#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount)
#define mi_subproc_stat_counter_increase(subproc,stat,amount) __mi_stat_counter_increase_mt( &(subproc)->stats.stat, amount)
#define mi_subproc_stat_increase(subproc,stat,amount) __mi_stat_increase_mt( &(subproc)->stats.stat, amount)
#define mi_subproc_stat_decrease(subproc,stat,amount) __mi_stat_decrease_mt( &(subproc)->stats.stat, amount)
#define mi_subproc_stat_adjust_increase(subproc,stat,amnt,b) __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amnt, b)
#define mi_subproc_stat_adjust_decrease(subproc,stat,amnt,b) __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amnt, b)
#define mi_os_stat_counter_increase(stat,amount) mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount)
#define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount)
#define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount)
#define mi_heap_stat_counter_increase(heap,stat,amount) __mi_stat_counter_increase( &(heap)->tld->stats.stat, amount)
#define mi_heap_stat_increase(heap,stat,amount) __mi_stat_increase( &(heap)->tld->stats.stat, amount)
#define mi_heap_stat_decrease(heap,stat,amount) __mi_stat_decrease( &(heap)->tld->stats.stat, amount)
#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_stat_counter_increase( (heap)->tld->stats.stat, amount)
#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_stat_increase( (heap)->tld->stats.stat, amount)
#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_stat_decrease( (heap)->tld->stats.stat, amount)
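// [illustrative usage, not part of this diff] the heap variants update the thread-local
// (tld) statistics without atomics, the subproc variants update the shared sub-process
// statistics through the `_mt` (atomic) functions, and the `mi_debug_stat_*` variants
// compile away when MI_STAT is 0. For example:
//   mi_heap_stat_increase(heap, normal, bsize);                  // per-thread, non-atomic
//   mi_subproc_stat_counter_increase(subproc, arena_purges, 1);  // shared, atomic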
// ------------------------------------------------------
// Sub processes do not reclaim or visit segments
// from other sub processes
// Sub processes use separate arena's and no heaps/pages/blocks
// are shared between sub processes.
// The subprocess structure contains essentially all static variables (except per subprocess :-))
//
// Each thread should belong to one sub-process only
// ------------------------------------------------------
struct mi_subproc_s {
_Atomic(size_t) abandoned_count; // count of abandoned segments for this sub-process
_Atomic(size_t) abandoned_os_list_count; // count of abandoned segments in the os-list
mi_lock_t abandoned_os_lock; // lock for the abandoned os segment list (outside of arena's) (this lock protect list operations)
mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list
mi_segment_t* abandoned_os_list; // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory)
mi_segment_t* abandoned_os_list_tail; // the tail-end of the list
mi_memid_t memid; // provenance of this memory block
};
#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`)
// 160 arenas is enough for ~2 TiB memory
typedef struct mi_subproc_s {
_Atomic(size_t) arena_count; // current count of arena's
_Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS]; // arena's of this sub-process
mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time
_Atomic(int64_t) purge_expire; // expiration is set if any arenas can be purged
_Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process
mi_page_t* os_abandoned_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on)
mi_lock_t os_abandoned_pages_lock; // lock for the os abandoned pages list (this lock protects list operations)
mi_memid_t memid; // provenance of this memory block (meta or OS)
mi_stats_t stats; // sub-process statistics (tld stats are merged in on thread termination)
} mi_subproc_t;
// ------------------------------------------------------
// Thread Local data
@ -659,34 +580,57 @@ struct mi_subproc_s {
// Milliseconds as in `int64_t` to avoid overflows
typedef int64_t mi_msecs_t;
// Queue of segments
typedef struct mi_segment_queue_s {
mi_segment_t* first;
mi_segment_t* last;
} mi_segment_queue_t;
// Segments thread local data
typedef struct mi_segments_tld_s {
mi_segment_queue_t small_free; // queue of segments with free small pages
mi_segment_queue_t medium_free; // queue of segments with free medium pages
mi_page_queue_t pages_purge; // queue of freed pages that are delay purged
size_t count; // current number of segments;
size_t peak_count; // peak number of segments
size_t current_size; // current size of all segments
size_t peak_size; // peak size of all segments
size_t reclaim_count;// number of reclaimed (abandoned) segments
mi_subproc_t* subproc; // sub-process this thread belongs to.
mi_stats_t* stats; // points to tld stats
} mi_segments_tld_t;
// Thread local data
struct mi_tld_s {
unsigned long long heartbeat; // monotonic heartbeat count
bool recurse; // true if deferred was called; used to prevent infinite recursion.
mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted)
mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates)
mi_segments_tld_t segments; // segment tld
mi_stats_t stats; // statistics
mi_threadid_t thread_id; // thread id of this thread
size_t thread_seq; // thread sequence id (linear count of created threads)
mi_subproc_t* subproc; // sub-process this thread belongs to.
mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted)
mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates)
unsigned long long heartbeat; // monotonic heartbeat count
bool recurse; // true if deferred was called; used to prevent infinite recursion.
bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks)
mi_stats_t stats; // statistics
mi_memid_t memid; // provenance of the tld memory itself (meta or OS)
};
/* -----------------------------------------------------------
Error codes passed to `_mi_fatal_error`
All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
For portability define undefined error codes using common Unix codes:
<https://www-numi.fnal.gov/offline_software/srt_public_context/WebDocs/Errors/unix_system_errors.html>
----------------------------------------------------------- */
#ifndef EAGAIN // double free
#define EAGAIN (11)
#endif
#ifndef ENOMEM // out of memory
#define ENOMEM (12)
#endif
#ifndef EFAULT // corrupted free-list or meta-data
#define EFAULT (14)
#endif
#ifndef EINVAL // trying to free an invalid pointer
#define EINVAL (22)
#endif
#ifndef EOVERFLOW // count*size overflow
#define EOVERFLOW (75)
#endif
// ------------------------------------------------------
// Debug
// ------------------------------------------------------
#ifndef MI_DEBUG_UNINIT
#define MI_DEBUG_UNINIT (0xD0)
#endif
#ifndef MI_DEBUG_FREED
#define MI_DEBUG_FREED (0xDF)
#endif
#ifndef MI_DEBUG_PADDING
#define MI_DEBUG_PADDING (0xDE)
#endif
#endif // MI_TYPES_H

View file

@ -16,21 +16,22 @@ terms of the MIT license. A copy of the license can be found in the file
// ------------------------------------------------------
static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) {
// objects up to `MI_MAX_ALIGN_GUARANTEE` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`).
// objects up to `MI_PAGE_MIN_BLOCK_ALIGN` are always allocated aligned to their size
mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0));
if (alignment > size) return false;
if (alignment <= MI_MAX_ALIGN_SIZE) return true;
const size_t bsize = mi_good_size(size);
return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0);
const bool ok = (bsize <= MI_PAGE_MAX_START_BLOCK_ALIGN2 && _mi_is_power_of_two(bsize));
if (ok) { mi_assert_internal((bsize & (alignment-1)) == 0); } // since both power of 2 and alignment <= size
return ok;
}
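// [illustrative note, not part of this diff] the power-of-two reasoning above: if `bsize`
// is a power of two and `alignment <= bsize` is also a power of two, then `bsize` is a
// multiple of `alignment` and `(bsize & (alignment-1)) == 0` holds. For example a 256-byte
// block is naturally 128-byte aligned, whereas a 48-byte block gives no such guarantee.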
#if MI_GUARDED
static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, size_t size, size_t alignment, bool zero) mi_attr_noexcept {
// use over-allocation for guarded blocks
mi_assert_internal(alignment > 0 && alignment < MI_BLOCK_ALIGNMENT_MAX);
mi_assert_internal(alignment > 0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN);
const size_t oversize = size + alignment - 1;
void* base = _mi_heap_malloc_guarded(heap, oversize, zero);
void* p = mi_align_up_ptr(base, alignment);
void* p = _mi_align_up_ptr(base, alignment);
mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size);
mi_assert_internal(mi_usable_size(p) >= size);
mi_assert_internal(_mi_is_aligned(p, alignment));
@ -59,21 +60,20 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
void* p;
size_t oversize;
if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) {
// use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page)
// This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the
// first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down)
if mi_unlikely(alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
// use OS allocation for large alignments and allocate inside a singleton page (not in an arena)
// This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned
// in the first (and single) page such that the page info is `MI_PAGE_ALIGN` bytes before it (and can be found in the _mi_page_map).
if mi_unlikely(offset != 0) {
// todo: cannot support offset alignment for very large alignments yet
#if MI_DEBUG > 0
_mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
#endif
#if MI_DEBUG > 0
_mi_error_message(EOVERFLOW, "aligned allocation with a large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
#endif
return NULL;
}
oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
// note: no guarded as alignment > 0
p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block
// zero afterwards as only the area from the aligned_p may be committed!
p = _mi_heap_malloc_zero_ex(heap, oversize, zero, alignment); // the page block size should be large enough to align in the single huge page block
if (p == NULL) return NULL;
}
else {
@ -114,13 +114,13 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
#endif
// now zero the block if needed
if (alignment > MI_BLOCK_ALIGNMENT_MAX) {
// for the tracker, on huge aligned allocations only from the start of the large block is defined
mi_track_mem_undefined(aligned_p, size);
if (zero) {
_mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
}
}
//if (alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
// // for the tracker, on huge aligned allocations only from the start of the large block is defined
// mi_track_mem_undefined(aligned_p, size);
// if (zero) {
// _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
// }
//}
if (p != aligned_p) {
mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p));
@ -177,12 +177,14 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
}
#if MI_GUARDED
if (offset==0 && alignment < MI_BLOCK_ALIGNMENT_MAX && mi_heap_malloc_use_guarded(heap,size)) {
if (offset==0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN && mi_heap_malloc_use_guarded(heap,size)) {
return mi_heap_malloc_guarded_aligned(heap, size, alignment, zero);
}
#endif
// try first if there happens to be a small block available with just the right alignment
// since most small power-of-2 blocks (under MI_PAGE_MAX_START_BLOCK_ALIGN2) are already
// naturally aligned, this is often the case.
if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) {
const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)`
const size_t padsize = size + MI_PADDING_SIZE;
@ -191,9 +193,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0;
if mi_likely(is_aligned)
{
#if MI_STAT>1
mi_heap_stat_increase(heap, malloc, size);
#endif
mi_debug_heap_stat_increase(heap, malloc, size);
void* p = (zero ? _mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen
mi_assert_internal(p != NULL);
mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);

View file

@ -30,7 +30,11 @@ terms of the MIT license. A copy of the license can be found in the file
// Note: in release mode the (inlined) routine is about 7 instructions with a single test.
extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept
{
mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size);
if (page->block_size != 0) { // not the empty heap
mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_ptr_page(page)==page);
}
// check the free list
mi_block_t* const block = page->free;
@ -82,7 +86,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_
#if (MI_STAT>0)
const size_t bsize = mi_page_usable_block_size(page);
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_increase(heap, normal, bsize);
mi_heap_stat_counter_increase(heap, normal_count, 1);
#if (MI_STAT>1)
@ -130,7 +134,7 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap,
mi_assert(size <= MI_SMALL_SIZE_MAX);
#if MI_DEBUG
const uintptr_t tid = _mi_thread_id();
mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local
mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == tid); // heaps are thread local
#endif
#if (MI_PADDING || MI_GUARDED)
if (size == 0) { size = sizeof(void*); }
@ -184,7 +188,7 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z
else {
// regular allocation
mi_assert(heap!=NULL);
mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == _mi_thread_id()); // heaps are thread local
void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic
mi_track_malloc(p,size,zero);
@ -268,7 +272,7 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero)
// if p == NULL then behave as malloc.
// else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)).
// (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.)
const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0)
const size_t size = (p==NULL ? 0 : _mi_usable_size(p,"mi_realloc"));
if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) { // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0)
mi_assert_internal(p!=NULL);
// todo: do not track as the usable size is still the same in the free; adjust potential padding?
@ -615,7 +619,6 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
block->next = MI_BLOCK_TAG_GUARDED;
// set guard page at the end of the block
mi_segment_t* const segment = _mi_page_segment(page);
const size_t block_size = mi_page_block_size(page); // must use `block_size` to match `mi_free_local`
const size_t os_page_size = _mi_os_page_size();
mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t));
@ -625,8 +628,11 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
return NULL;
}
uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size;
// note: the alignment of the guard page relies on blocks being os_page_size aligned which
// is ensured in `mi_arena_page_alloc_fresh`.
mi_assert_internal(_mi_is_aligned(block, os_page_size));
mi_assert_internal(_mi_is_aligned(guard_page, os_page_size));
if (segment->allow_decommit && _mi_is_aligned(guard_page, os_page_size)) {
if (!page->memid.is_pinned && _mi_is_aligned(guard_page, os_page_size)) {
_mi_os_protect(guard_page, os_page_size);
}
else {
@ -636,9 +642,9 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
// align pointer just in front of the guard page
size_t offset = block_size - os_page_size - obj_size;
mi_assert_internal(offset > sizeof(mi_block_t));
if (offset > MI_BLOCK_ALIGNMENT_MAX) {
if (offset > MI_PAGE_MAX_OVERALLOC_ALIGN) {
// give up on placing it right in front of the guard page if the offset is too large for unalignment
offset = MI_BLOCK_ALIGNMENT_MAX;
offset = MI_PAGE_MAX_OVERALLOC_ALIGN;
}
void* p = (uint8_t*)block + offset;
mi_track_align(block, p, offset, obj_size);
@ -659,7 +665,7 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo
const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size);
mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(heap, req_size, zero, 0 /* huge_alignment */);
if (block==NULL) return NULL;
void* const p = mi_block_ptr_set_guarded(block, obj_size);
void* const p = mi_block_ptr_set_guarded(block, obj_size);
// stats
mi_track_malloc(p, size, zero);
@@ -668,7 +674,7 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo
#if MI_STAT>1
mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
#endif
_mi_stat_counter_increase(&heap->tld->stats.guarded_alloc_count, 1);
mi_heap_stat_counter_increase(heap, guarded_alloc_count, 1);
}
#if MI_DEBUG>3
if (p != NULL && zero) {


@@ -1,346 +0,0 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#if !defined(MI_IN_ARENA_C)
#error "this file should be included from 'arena.c' (so mi_arena_t is visible)"
// add includes to help an IDE
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"
#endif
// Minimal exports for arena-abandoned.
size_t mi_arena_id_index(mi_arena_id_t id);
mi_arena_t* mi_arena_from_index(size_t idx);
size_t mi_arena_get_count(void);
void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex);
bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index);
/* -----------------------------------------------------------
Abandoned blocks/segments:
_mi_arena_segment_clear_abandoned
_mi_arena_segment_mark_abandoned
This is used to atomically abandon/reclaim segments
(and crosses the arena API but it is convenient to have here).
Abandoned segments still have live blocks; they get reclaimed
when a thread frees a block in it, or when a thread needs a fresh
segment.
Abandoned segments are atomically marked in the `block_abandoned`
bitmap of arenas. Any segments allocated outside arenas are put
in the sub-process `abandoned_os_list`. This list is accessed
using locks but this should be uncommon and generally uncontended.
Reclaim and visiting either scan through the `block_abandoned`
bitmaps of the arenas, or visit the `abandoned_os_list`.
A potentially nicer design is to use arenas for everything,
and perhaps have virtual arenas to map OS-allocated memory,
but this would lack the "density" of our current arenas. TBC.
----------------------------------------------------------- */
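(For orientation, not part of the diff: a condensed sketch of how the cursor API in this now-removed file was driven; it mirrors `mi_abandoned_visit_blocks` near the end of the file, with `subproc` standing in for the caller's `mi_subproc_t*` and error handling omitted.)
  mi_arena_field_cursor_t cursor;
  _mi_arena_field_cursor_init(NULL, subproc, true /* visit all */, &cursor);
  mi_segment_t* segment;
  while ((segment = _mi_arena_segment_clear_abandoned_next(&cursor)) != NULL) {
    // ... visit or reclaim the segment's blocks ...
    _mi_arena_segment_mark_abandoned(segment);   // re-abandon when only visiting
  }
  _mi_arena_field_cursor_done(&cursor);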
// reclaim a specific OS abandoned segment; `true` on success.
// sets the thread_id.
static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) {
mi_assert(segment->memid.memkind != MI_MEM_ARENA);
// not in an arena, remove from list of abandoned os segments
mi_subproc_t* const subproc = segment->subproc;
if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) {
return false; // failed to acquire the lock, we just give up
}
// remove atomically from the abandoned os list (if possible!)
bool reclaimed = false;
mi_segment_t* const next = segment->abandoned_os_next;
mi_segment_t* const prev = segment->abandoned_os_prev;
if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) {
#if MI_DEBUG>3
// find ourselves in the abandoned list (and check the count)
bool found = false;
size_t count = 0;
for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) {
if (current == segment) { found = true; }
count++;
}
mi_assert_internal(found);
mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count));
#endif
// remove (atomically) from the list and reclaim
if (prev != NULL) { prev->abandoned_os_next = next; }
else { subproc->abandoned_os_list = next; }
if (next != NULL) { next->abandoned_os_prev = prev; }
else { subproc->abandoned_os_list_tail = prev; }
segment->abandoned_os_next = NULL;
segment->abandoned_os_prev = NULL;
mi_atomic_decrement_relaxed(&subproc->abandoned_count);
mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count);
if (take_lock) { // don't reset the thread_id when iterating
mi_atomic_store_release(&segment->thread_id, _mi_thread_id());
}
reclaimed = true;
}
if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); }
return reclaimed;
}
// reclaim a specific abandoned segment; `true` on success.
// sets the thread_id.
bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) {
if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) {
return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */);
}
// arena segment: use the blocks_abandoned bitmap.
size_t arena_idx;
size_t bitmap_idx;
mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx);
mi_arena_t* arena = mi_arena_from_index(arena_idx);
mi_assert_internal(arena != NULL);
// reclaim atomically
bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx);
if (was_marked) {
mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0);
mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count);
mi_atomic_store_release(&segment->thread_id, _mi_thread_id());
}
// mi_assert_internal(was_marked);
mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
//mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx));
return was_marked;
}
// mark a specific OS segment as abandoned
static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) {
mi_assert(segment->memid.memkind != MI_MEM_ARENA);
// not in an arena; we use a list of abandoned segments
mi_subproc_t* const subproc = segment->subproc;
mi_lock(&subproc->abandoned_os_lock) {
// push on the tail of the list (important for the visitor)
mi_segment_t* prev = subproc->abandoned_os_list_tail;
mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL);
mi_assert_internal(segment->abandoned_os_prev == NULL);
mi_assert_internal(segment->abandoned_os_next == NULL);
if (prev != NULL) { prev->abandoned_os_next = segment; }
else { subproc->abandoned_os_list = segment; }
subproc->abandoned_os_list_tail = segment;
segment->abandoned_os_prev = prev;
segment->abandoned_os_next = NULL;
mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count);
mi_atomic_increment_relaxed(&subproc->abandoned_count);
// and release the lock
}
return;
}
// mark a specific segment as abandoned
// clears the thread_id.
void _mi_arena_segment_mark_abandoned(mi_segment_t* segment)
{
mi_assert_internal(segment->used == segment->abandoned);
mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's
if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) {
mi_arena_segment_os_mark_abandoned(segment);
return;
}
// segment is in an arena, mark it in the arena `blocks_abandoned` bitmap
size_t arena_idx;
size_t bitmap_idx;
mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx);
mi_arena_t* arena = mi_arena_from_index(arena_idx);
mi_assert_internal(arena != NULL);
// set abandonment atomically
mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned
const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL);
if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); }
mi_assert_internal(was_unmarked);
mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
}
/* -----------------------------------------------------------
Iterate through the abandoned blocks/segments using a cursor.
This is used for reclaiming and abandoned block visiting.
----------------------------------------------------------- */
// start a cursor at a randomized arena
void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) {
mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc);
current->bitmap_idx = 0;
current->subproc = subproc;
current->visit_all = visit_all;
current->hold_visit_lock = false;
const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count);
const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count);
const size_t max_arena = mi_arena_get_count();
if (heap != NULL && heap->arena_id != _mi_arena_id_none()) {
// for a heap that is bound to one arena, only visit that arena
current->start = mi_arena_id_index(heap->arena_id);
current->end = current->start + 1;
current->os_list_count = 0;
}
else {
// otherwise visit all starting at a random location
if (abandoned_count > abandoned_list_count && max_arena > 0) {
current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena));
current->end = current->start + max_arena;
}
else {
current->start = 0;
current->end = 0;
}
current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list
}
mi_assert_internal(current->start <= max_arena);
}
void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) {
if (current->hold_visit_lock) {
mi_lock_release(&current->subproc->abandoned_os_visit_lock);
current->hold_visit_lock = false;
}
}
static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) {
// try to reclaim an abandoned segment in the arena atomically
if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL;
mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx);
mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0);
// check that the segment belongs to our sub-process
// note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled.
// without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process.
// for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock.
if (segment->subproc != subproc) {
// it is from another sub-process, re-mark it and continue searching
const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL);
mi_assert_internal(was_zero); MI_UNUSED(was_zero);
return NULL;
}
else {
// success, we unabandoned a segment in our sub-process
mi_atomic_decrement_relaxed(&subproc->abandoned_count);
return segment;
}
}
static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) {
const size_t max_arena = mi_arena_get_count();
size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx);
size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx);
// visit arena's (from the previous cursor)
for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) {
// index wraps around
size_t arena_idx = (previous->start >= max_arena ? previous->start % max_arena : previous->start);
mi_arena_t* arena = mi_arena_from_index(arena_idx);
if (arena != NULL) {
bool has_lock = false;
// visit the abandoned fields (starting at previous_idx)
for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) {
size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]);
if mi_unlikely(field != 0) { // skip zero fields quickly
// we only take the arena lock if there are actually abandoned segments present
if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) {
has_lock = (previous->visit_all ? (mi_lock_acquire(&arena->abandoned_visit_lock),true) : mi_lock_try_acquire(&arena->abandoned_visit_lock));
if (!has_lock) {
if (previous->visit_all) {
_mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock");
}
// skip to next arena
break;
}
}
mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned));
// visit each set bit in the field (todo: maybe use `ctz` here?)
for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) {
// pre-check if the bit is set
size_t mask = ((size_t)1 << bit_idx);
if mi_unlikely((field & mask) == mask) {
mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx);
mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx);
if (segment != NULL) {
//mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx));
if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); }
previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration
return segment;
}
}
}
}
}
if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); }
}
}
return NULL;
}
static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) {
// go through the abandoned_os_list
// we only allow one thread per sub-process to do the visiting, guarded by the `abandoned_os_visit_lock`.
// The lock is released when the cursor is released.
if (!previous->hold_visit_lock) {
previous->hold_visit_lock = (previous->visit_all ? (mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock),true)
: mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock));
if (!previous->hold_visit_lock) {
if (previous->visit_all) {
_mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock");
}
return NULL; // we cannot get the lock, give up
}
}
// One list entry at a time
while (previous->os_list_count > 0) {
previous->os_list_count--;
mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free`
mi_segment_t* segment = previous->subproc->abandoned_os_list;
// pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries)
if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) {
mi_lock_release(&previous->subproc->abandoned_os_lock);
return segment;
}
// already abandoned, try again
mi_lock_release(&previous->subproc->abandoned_os_lock);
}
// done
mi_assert_internal(previous->os_list_count == 0);
return NULL;
}
// reclaim abandoned segments
// this does not set the thread id (so it appears as still abandoned)
mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) {
if (previous->start < previous->end) {
// walk the arena
mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous);
if (segment != NULL) { return segment; }
}
// no entries in the arena's anymore, walk the abandoned OS list
mi_assert_internal(previous->start == previous->end);
return mi_arena_segment_clear_abandoned_next_list(previous);
}
bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
// (unfortunately) the visit_abandoned option must be enabled from the start.
// This is to avoid taking locks if abandoned list visiting is not required (as for most programs)
if (!mi_option_is_enabled(mi_option_visit_abandoned)) {
_mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON");
return false;
}
mi_arena_field_cursor_t current;
_mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, &current);
mi_segment_t* segment;
bool ok = true;
while (ok && (segment = _mi_arena_segment_clear_abandoned_next(&current)) != NULL) {
ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg);
_mi_arena_segment_mark_abandoned(segment);
}
_mi_arena_field_cursor_done(&current);
return ok;
}

src/arena-meta.c Normal file (174 lines)

@@ -0,0 +1,174 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
We have a special "mini" allocator just for allocation of meta-data like
the heap (`mi_heap_t`) or thread-local data (`mi_tld_t`).
We reuse the bitmap of the arenas for allocation of 64b blocks inside
an arena slice (64KiB).
We always ensure that meta data is zero'd (we zero on `free`)
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"
/* -----------------------------------------------------------
Meta data allocation
----------------------------------------------------------- */
#define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE
#define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN
#define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit)
#define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE
#define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024
#define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE)
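(Illustration, not part of the diff: using the 64-bit value MI_BCHUNK_BITS = 512 quoted in the new bitmap header further down, MI_BCHUNK_SIZE = 512/8 = 64, so MI_META_MAX_SIZE = 64 * 128 bytes = 8 KiB, comfortably above the "> 4k" bound that the MI_META_BLOCK_SIZE comment aims for.)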
typedef struct mi_meta_page_s {
_Atomic(struct mi_meta_page_s*) next; // a linked list of meta-data pages (never released)
mi_memid_t memid; // provenance of the meta-page memory itself
mi_bbitmap_t blocks_free; // a small bitmap with 1 bit per block.
} mi_meta_page_t;
static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_VAR_INIT(NULL);
#if MI_DEBUG > 1
static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) {
mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + _mi_os_secure_guard_page_size());
if (block_idx != NULL) {
*block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE;
}
return mpage;
}
#endif
static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) {
return mi_atomic_load_ptr_acquire(mi_meta_page_t, &mpage->next);
}
static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) {
mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - _mi_os_secure_guard_page_size(), MI_META_PAGE_ALIGN));
mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE);
void* p = ((uint8_t*)mpage - _mi_os_secure_guard_page_size() + (block_idx * MI_META_BLOCK_SIZE));
mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL));
return p;
}
// allocate a fresh meta page and add it to the global list.
static mi_meta_page_t* mi_meta_page_zalloc(void) {
// allocate a fresh arena slice
// note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again..
mi_memid_t memid;
uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0,
true /* commit*/, (MI_SECURE==0) /* allow large? */,
NULL /* req arena */, 0 /* thread_seq */, &memid);
if (base == NULL) return NULL;
mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN));
if (!memid.initially_zero) {
_mi_memzero_aligned(base, MI_ARENA_SLICE_SIZE);
}
// guard pages
#if MI_SECURE >= 1
_mi_os_secure_guard_page_set_at(base, memid.is_pinned);
_mi_os_secure_guard_page_set_before(base + MI_META_PAGE_SIZE, memid.is_pinned);
#endif
// initialize the page and free block bitmap
mi_meta_page_t* mpage = (mi_meta_page_t*)(base + _mi_os_secure_guard_page_size());
mpage->memid = memid;
mi_bbitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */);
const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bbitmap_size(MI_META_BLOCKS_PER_PAGE, NULL);
const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE);
const size_t guard_blocks = _mi_divide_up(_mi_os_secure_guard_page_size(), MI_META_BLOCK_SIZE);
mi_assert_internal(info_blocks + 2*guard_blocks < MI_META_BLOCKS_PER_PAGE);
mi_bbitmap_unsafe_setN(&mpage->blocks_free, info_blocks + guard_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks - 2*guard_blocks);
// push atomically in front of the meta page list
// (note: there is no ABA issue since we never free meta-pages)
mi_meta_page_t* old = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages);
do {
mi_atomic_store_ptr_release(mi_meta_page_t, &mpage->next, old);
} while(!mi_atomic_cas_ptr_weak_acq_rel(mi_meta_page_t,&mi_meta_pages,&old,mpage));
return mpage;
}
// allocate meta-data
mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid )
{
mi_assert_internal(pmemid != NULL);
size = _mi_align_up(size,MI_META_BLOCK_SIZE);
if (size == 0 || size > MI_META_MAX_SIZE) return NULL;
const size_t block_count = _mi_divide_up(size,MI_META_BLOCK_SIZE);
mi_assert_internal(block_count > 0 && block_count < MI_BCHUNK_BITS);
mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages);
mi_meta_page_t* mpage = mpage0;
while (mpage != NULL) {
size_t block_idx;
if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) {
// found and claimed `block_count` blocks
*pmemid = _mi_memid_create_meta(mpage, block_idx, block_count);
return mi_meta_block_start(mpage,block_idx);
}
else {
mpage = mi_meta_page_next(mpage);
}
}
// failed to find space in existing pages
if (mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages) != mpage0) {
// the page list was updated by another thread in the meantime, retry
return _mi_meta_zalloc(size,pmemid);
}
// otherwise, allocate a fresh metapage and try once more
mpage = mi_meta_page_zalloc();
if (mpage != NULL) {
size_t block_idx;
if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) {
// found and claimed `block_count` blocks
*pmemid = _mi_memid_create_meta(mpage, block_idx, block_count);
return mi_meta_block_start(mpage,block_idx);
}
}
// if all this failed, allocate from the OS
return _mi_os_alloc(size, pmemid);
}
// free meta-data
mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) {
if (p==NULL) return;
if (memid.memkind == MI_MEM_META) {
mi_assert_internal(_mi_divide_up(size, MI_META_BLOCK_SIZE) == memid.mem.meta.block_count);
const size_t block_count = memid.mem.meta.block_count;
const size_t block_idx = memid.mem.meta.block_index;
mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page;
mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage);
mi_assert_internal(block_idx + block_count < MI_META_BLOCKS_PER_PAGE);
mi_assert_internal(mi_bbitmap_is_clearN(&mpage->blocks_free, block_idx, block_count));
// we zero on free (and on the initial page allocation) so we don't need a "dirty" map
_mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE);
mi_bbitmap_setN(&mpage->blocks_free, block_idx, block_count);
}
else {
_mi_arenas_free(p,size,memid);
}
}
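(Hypothetical usage sketch, not part of the diff: callers keep the returned `mi_memid_t` and hand it back on free so `_mi_meta_free` can tell whether the block came from a meta page or from the OS fallback; the 256-byte size is just an example.)
  mi_memid_t memid;
  void* p = _mi_meta_zalloc(256, &memid);   // zero-initialized, rounded up to 128-byte blocks
  if (p != NULL) {
    // ... use the meta-data block ...
    _mi_meta_free(p, 256, memid);           // zeroed again here so the blocks can be reused
  }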
// used for debug output
bool _mi_meta_is_meta_page(void* p)
{
mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t, &mi_meta_pages);
mi_meta_page_t* mpage = mpage0;
while (mpage != NULL) {
if ((void*)mpage == p) return true;
mpage = mi_meta_page_next(mpage);
}
return false;
}

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,110 +1,317 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2023 Microsoft Research, Daan Leijen
Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
Concurrent bitmap that can set/reset sequences of bits atomically,
represented as an array of fields where each field is a machine word (`size_t`)
There are two api's; the standard one cannot have sequences that cross
between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
(this is used in region allocation)
The `_across` postfixed functions do allow sequences that can cross over
between the fields. (This is used in arena allocation)
Concurrent bitmap that can set/reset sequences of bits atomically
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITMAP_H
#define MI_BITMAP_H
/* -----------------------------------------------------------
Bitmap definition
----------------------------------------------------------- */
/* --------------------------------------------------------------------------------
Atomic bitmaps with release/acquire guarantees:
#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE)
#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set
`mi_bfield_t`: is a single machine word that can efficiently be bit counted (usually `size_t`)
each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB).
We need 16K bits to represent a 1GiB arena.
// An atomic bitmap of `size_t` fields
typedef _Atomic(size_t) mi_bitmap_field_t;
typedef mi_bitmap_field_t* mi_bitmap_t;
`mi_bchunk_t`: a chunk of bfield's of a total of MI_BCHUNK_BITS (= 512 on 64-bit, 256 on 32-bit)
allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number
of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB).
These chunks are cache-aligned and we can use AVX2/AVX512/NEON/SVE/SVE2/etc. instructions
to scan for bits (perhaps) more efficiently.
// A bitmap index is the index of the bit in a bitmap.
typedef size_t mi_bitmap_index_t;
We allocate byte-sized ranges aligned to bytes in the bfield, and bfield-sized
ranges aligned to a bfield.
// Create a bit index.
static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) {
mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS);
return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
}
static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
return mi_bitmap_index_create_ex(idx,bitidx);
}
Searching linearly through the chunks would be too slow (16K bits per GiB).
Instead we add a "chunkmap" to do a two-level search (more or less a btree of depth 2).
// Get the field index from a bit index.
static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx / MI_BITMAP_FIELD_BITS);
}
`mi_bchunkmap_t` (== `mi_bchunk_t`): for each chunk we track if it has (potentially) any bit set.
The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set.
This is used to avoid scanning every chunk. (and thus strictly an optimization)
It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out
to have no bits set. It is also allowed to briefly have a clear bit even if the
chunk has bits set -- as long as we guarantee that the bit will be set later on;
(this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk).
// Get the bit index in a bitmap field
static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx % MI_BITMAP_FIELD_BITS);
}
However, when we clear a bit in a chunk, and the chunk is indeed all clear, we
cannot safely clear the bit corresponding to the chunk in the chunkmap since it
may race with another thread setting a bit in the same chunk. Therefore, when
clearing, we first test if a chunk is clear, then clear the chunkmap bit, and
then test again to catch any set bits that we may have missed.
// Get the full bit index
static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
return bitmap_idx;
}
Since the chunkmap may thus be briefly out-of-sync, this means that we may sometimes
not find a free page even though it's there (but we accept this as we avoid taking
full locks). (Another way to do this is to use an epoch but we like to avoid that complexity
for now).
/* -----------------------------------------------------------
Claim a bit sequence atomically
----------------------------------------------------------- */
`mi_bitmap_t`: a bitmap with N chunks. A bitmap has a chunkmap of MI_BCHUNK_BITS (512)
and thus has at most 512 chunks (=2^18 bits x 64 KiB slices = 16 GiB max arena size).
The minimum is 1 chunk which is a 32 MiB arena.
// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count
and pop-count (but we think it can be adapted to work reasonably well on older hardware too)
--------------------------------------------------------------------------------------------- */
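(Standalone illustration, not part of the diff: the two-level chunkmap scan described above, written against plain `uint64_t` words instead of the atomic `mi_bfield_t`/`mi_bchunk_t` types, and ignoring concurrency; `__builtin_ctzll` stands in for the trailing-zero-count the comment mentions.)
  #include <stdbool.h>
  #include <stddef.h>
  #include <stdint.h>
  #define CHUNK_WORDS 8   /* 8 x 64 bits = 512 bits per chunk, as on 64-bit */
  typedef struct { uint64_t bf[CHUNK_WORDS]; } chunk_t;
  /* `cmap` has one bit per chunk that may contain set bits; finds the lowest set bit overall */
  static bool find_set_bit(uint64_t cmap, const chunk_t* chunks, size_t* idx) {
    while (cmap != 0) {
      const unsigned ci = (unsigned)__builtin_ctzll(cmap);    /* level 1: pick a candidate chunk */
      for (unsigned i = 0; i < CHUNK_WORDS; i++) {            /* level 2: scan its 512 bits      */
        const uint64_t b = chunks[ci].bf[i];
        if (b != 0) { *idx = (size_t)ci*512 + i*64 + (unsigned)__builtin_ctzll(b); return true; }
      }
      cmap &= (cmap - 1);   /* the chunkmap is conservative: the chunk may be empty, keep looking */
    }
    return false;
  }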
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
// A word-size bit field.
typedef size_t mi_bfield_t;
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3)
#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT)
#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8)
#define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 ..
#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 ..
// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
// Returns `true` if successful when all previous `count` bits were 0.
bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
#define MI_BCHUNK_SIZE (MI_BCHUNK_BITS / 8)
#define MI_BCHUNK_FIELDS (MI_BCHUNK_BITS / MI_BFIELD_BITS) // 8 on both 64- and 32-bit
//--------------------------------------------------------------------------
// the `_across` functions work on bitmaps where sequences can cross over
// between the fields. This is used in arena allocation
//--------------------------------------------------------------------------
// A bitmap chunk contains 512 bits on 64-bit (256 on 32-bit)
typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bchunk_s {
_Atomic(mi_bfield_t) bfields[MI_BCHUNK_FIELDS];
} mi_bchunk_t;
// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
// The chunkmap has one bit per corresponding chunk that is set if the chunk potentially has bits set.
// The chunkmap is itself a chunk.
typedef mi_bchunk_t mi_bchunkmap_t;
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
#define MI_BCHUNKMAP_BITS MI_BCHUNK_BITS
#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BCHUNKMAP_BITS)
#define MI_BITMAP_MIN_CHUNK_COUNT (1)
#if MI_SIZE_BITS > 32
#define MI_BITMAP_DEFAULT_CHUNK_COUNT (64) // 2 GiB on 64-bit -- this is for the page map
#else
#define MI_BITMAP_DEFAULT_CHUNK_COUNT (1)
#endif
#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena
#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena
#define MI_BITMAP_DEFAULT_BIT_COUNT (MI_BITMAP_DEFAULT_CHUNK_COUNT * MI_BCHUNK_BITS) // 2 GiB arena
// An atomic bitmap
typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s {
_Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc
mi_bchunkmap_t chunkmap;
mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
} mi_bitmap_t;
static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) {
return mi_atomic_load_relaxed(&((mi_bitmap_t*)bitmap)->chunk_count);
}
static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) {
return (mi_bitmap_chunk_count(bitmap) * MI_BCHUNK_BITS);
}
/* --------------------------------------------------------------------------------
Atomic bitmap operations
-------------------------------------------------------------------------------- */
// Many operations are generic over setting or clearing the bit sequence: we use `mi_xset_t` for this (true if setting, false if clearing)
typedef bool mi_xset_t;
#define MI_BIT_SET (true)
#define MI_BIT_CLEAR (false)
// Required size of a bitmap to represent `bit_count` bits.
size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count);
// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true
// returns the size of the bitmap.
size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero);
// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks).
// Not atomic so only use if still local to a thread.
void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n);
// Set a bit in the bitmap; returns `true` if it atomically transitioned from 0 to 1
bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx);
// Clear a bit in the bitmap; returns `true` if it atomically transitioned from 1 to 0
bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx);
// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
// If `already_set` is not NULL, it is set to the count of bits that were already set.
// (this is used for correct statistics when committing over a partially committed area)
bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set);
// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n);
// Is a sequence of n bits already all set/cleared?
bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
// Is a sequence of n bits already set?
// (Used to check if a memory range is already committed)
static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n);
}
// Is a sequence of n bits already clear?
static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n);
}
static inline bool mi_bitmap_is_set(mi_bitmap_t* bitmap, size_t idx) {
return mi_bitmap_is_setN(bitmap, idx, 1);
}
static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) {
return mi_bitmap_is_clearN(bitmap, idx, 1);
}
// Called once a bit is cleared to see if the memory slice can be claimed.
typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_heaptag_t heap_tag, bool* keep_set);
// Find a set bit in the bitmap, atomically clear it, and check if `claim` returns true.
// If not claimed, continue on (potentially setting the bit again depending on `keep_set`).
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx,
mi_claim_fun_t* claim, mi_arena_t* arena, mi_heaptag_t heap_tag );
// Atomically clear a bit but only if it is set. Will block otherwise until the bit is set.
// This is used to delay freeing a page that is at the same time being considered to be
// allocated from `mi_arena_try_abandoned` (and is in the `claim` function of `mi_bitmap_try_find_and_claim`).
void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx);
// If a bit is set in the bitmap, return `true` and set `idx` to the index of the highest bit.
// Otherwise return `false` (and `*idx` is undefined).
// Used for unloading arenas
bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx);
typedef bool (mi_forall_set_fun_t)(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg2);
// Visit all set bits in a bitmap (`slice_count == 1`)
bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg);
// Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`)
bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg);
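(Hypothetical usage sketch, not part of the diff: exercising the core interface above on a one-chunk bitmap; the `calloc` backing store is only for illustration since inside mimalloc the bitmap lives in the arena meta-data.)
  size_t chunk_count;
  const size_t bytes = mi_bitmap_size(MI_BCHUNK_BITS, &chunk_count);   // bytes needed for 512 bits
  mi_bitmap_t* bm = (mi_bitmap_t*)calloc(1, bytes);                    // zeroed backing memory
  mi_bitmap_init(bm, MI_BCHUNK_BITS, true /* already_zero */);
  mi_bitmap_setN(bm, 0, 8, NULL);                    // mark slices 0..7 (may not cross a chunk)
  const bool all_set = mi_bitmap_is_setN(bm, 0, 8);  // -> true, e.g. "range is committed"
  mi_bitmap_clearN(bm, 0, 8);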
/* ----------------------------------------------------------------------------
Binned concurrent bitmap
Assigns a size class to each chunk such that small blocks don't cause too
much fragmentation since we keep chunks for larger blocks separate.
---------------------------------------------------------------------------- */
// Size bins; larger bins are allowed to go into smaller bins.
// SMALL can only be in small (and NONE), so they cannot fragment the larger bins.
typedef enum mi_bbin_e {
MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free)
MI_BBIN_SMALL, // slice_count == 1
MI_BBIN_OTHER, // slice_count: any count not covered by the other bins, with 1 <= slice_count <= MI_BCHUNK_BITS
MI_BBIN_MEDIUM, // slice_count == 8
MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS -- only used if MI_ENABLE_LARGE_PAGES is 1
MI_BBIN_COUNT
} mi_bbin_t;
static inline mi_bbin_t mi_bbin_inc(mi_bbin_t bbin) {
return (mi_bbin_t)((int)bbin + 1);
}
static inline mi_bbin_t mi_bbin_of(size_t slice_count) {
if (slice_count==1) return MI_BBIN_SMALL;
if (slice_count==8) return MI_BBIN_MEDIUM;
#if MI_ENABLE_LARGE_PAGES
if (slice_count==MI_BFIELD_BITS) return MI_BBIN_LARGE;
#endif
return MI_BBIN_OTHER;
}
// An atomic "binned" bitmap for the free slices where we keep chunks reserved for particalar size classes
typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s {
_Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
_Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set
size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc
mi_bchunkmap_t chunkmap;
_Atomic(uint8_t) chunk_bins[MI_BITMAP_MAX_CHUNK_COUNT]; // 512b
mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
} mi_bbitmap_t;
static inline size_t mi_bbitmap_chunk_count(const mi_bbitmap_t* bbitmap) {
return mi_atomic_load_relaxed(&((mi_bbitmap_t*)bbitmap)->chunk_count);
}
static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) {
return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS);
}
size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count);
// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true
// returns the size of the bitmap.
size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero);
// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks).
// Not atomic so only use if still local to a thread.
void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Set a sequence of `n` bits in the bbitmap; returns `true` if atomically transitioned from all 0's to 1's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bbitmap_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Is a sequence of n bits already all set/cleared?
bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Is a sequence of n bits already set?
// (Used to check if a memory range is already committed)
static inline bool mi_bbitmap_is_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
return mi_bbitmap_is_xsetN(MI_BIT_SET, bbitmap, idx, n);
}
// Is a sequence of n bits already clear?
static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
return mi_bbitmap_is_xsetN(MI_BIT_CLEAR, bbitmap, idx, n);
}
// Try to atomically transition `n` bits from all set to all clear. Returns `true` on success.
// `n` cannot cross chunk boundaries, where `n <= MI_BCHUNK_BITS`.
bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Specialized versions for common bit sequence sizes
bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 1-bit
bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits
// bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS
bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS
bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS
// Find a sequence of `n` bits in the bbitmap with all bits set, and try to atomically clear all.
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
mi_decl_nodiscard static inline bool mi_bbitmap_try_find_and_clearN(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx) {
if (n==1) return mi_bbitmap_try_find_and_clear(bbitmap, tseq, pidx); // small pages
if (n==8) return mi_bbitmap_try_find_and_clear8(bbitmap, tseq, pidx); // medium pages
// if (n==MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearX(bbitmap, tseq, pidx); // large pages
if (n==0 || n>MI_BCHUNK_BITS) return false; // cannot be more than a chunk
if (n<=MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearNX(bbitmap, tseq, n, pidx);
return mi_bbitmap_try_find_and_clearN_(bbitmap, tseq, n, pidx);
}
#endif // MI_BITMAP_H


@@ -23,9 +23,6 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block);
// Free
// ------------------------------------------------------
// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block);
// regular free of a (thread local) block pointer
// fast path written carefully to prevent spilling on the stack
static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full)
@@ -50,6 +47,40 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
}
}
// Forward declaration for multi-threaded collect
static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept;
// Free a block multi-threaded
static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_noexcept
{
// adjust stats (after padding check and potentially recursive `mi_free` above)
mi_stat_free(page, block); // stat_free may access the padding
mi_track_free_size(block, mi_page_usable_size_of(page, block));
// _mi_padding_shrink(page, block, sizeof(mi_block_t));
#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading
size_t dbgsize = mi_usable_size(block);
if (dbgsize > MI_MiB) { dbgsize = MI_MiB; }
_mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize);
#endif
// push atomically on the page thread free list
mi_thread_free_t tf_new;
mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free);
do {
mi_block_set_next(page, block, mi_tf_block(tf_old));
tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */);
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); // todo: release is enough?
// and atomically try to collect the page if it was abandoned
const bool is_owned_now = !mi_tf_is_owned(tf_old);
if (is_owned_now) {
mi_assert_internal(mi_page_is_abandoned(page));
mi_free_try_collect_mt(page);
}
}
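(Standalone illustration, not part of the diff: the compare-and-swap push used above, reduced to C11 atomics on a plain tagged pointer where bit 0 plays the role of the "owned" flag; it assumes nodes are at least 2-byte aligned so that bit is free.)
  #include <stdatomic.h>
  #include <stdint.h>
  typedef struct node_s { struct node_s* next; } node_t;
  /* push `n` and set the owned bit; returns true if the list was previously un-owned (abandoned) */
  static bool push_and_claim(_Atomic(uintptr_t)* list, node_t* n) {
    uintptr_t old = atomic_load_explicit(list, memory_order_relaxed);
    uintptr_t desired;
    do {
      n->next = (node_t*)(old & ~(uintptr_t)1);   /* link to the current head (tag stripped) */
      desired = (uintptr_t)n | 1;                 /* new head with the owned bit set         */
    } while (!atomic_compare_exchange_weak_explicit(list, &old, desired,
                                                    memory_order_acq_rel, memory_order_relaxed));
    return (old & 1) == 0;   /* like `is_owned_now` above: we just claimed an abandoned page */
  }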
// Adjust a block that was allocated aligned, to the actual start of the block in the page.
// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the
// `page_start` and `block_size` fields; however these are constant and the page won't be
@@ -57,7 +88,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) {
mi_assert_internal(page!=NULL && p!=NULL);
size_t diff = (uint8_t*)p - page->page_start;
size_t diff = (uint8_t*)p - mi_page_start(page);
size_t adjust;
if mi_likely(page->block_size_shift != 0) {
adjust = diff & (((size_t)1 << page->block_size_shift) - 1);
@@ -81,218 +112,153 @@ static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, vo
}
#endif
// free a local pointer (page parameter comes first for better codegen)
static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept {
MI_UNUSED(segment);
static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept {
mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p);
mi_block_check_unguard(page, block, p);
mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */);
}
// free a pointer owned by another thread (page parameter comes first for better codegen)
static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept {
static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept {
if (p==NULL) return; // a NULL pointer is seen as abandoned (tid==0) with a full flag set
mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865)
mi_block_check_unguard(page, block, p);
mi_free_block_mt(page, segment, block);
mi_free_block_mt(page, block);
}
// generic free (for runtime integration)
void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
if (is_local) mi_free_generic_local(page,segment,p);
else mi_free_generic_mt(page,segment,p);
void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
if (is_local) mi_free_generic_local(page,p);
else mi_free_generic_mt(page,p);
}
// Get the segment data belonging to a pointer
// This is just a single `and` in release mode but does further checks in debug mode
// (and secure mode) to see if this was a valid pointer.
static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg)
static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg)
{
MI_UNUSED(msg);
#if (MI_DEBUG>0)
MI_UNUSED_RELEASE(msg);
#if MI_DEBUG
if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) {
_mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
return NULL;
}
#endif
mi_segment_t* const segment = _mi_ptr_segment(p);
if mi_unlikely(segment==NULL) return segment;
#if (MI_DEBUG>0)
if mi_unlikely(!mi_is_in_heap_region(p)) {
_mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
"(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) {
_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
}
mi_page_t* const page = _mi_safe_ptr_page(p);
if (page == NULL && p != NULL) {
_mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p);
}
return page;
#else
return _mi_ptr_page(p);
#endif
#if (MI_DEBUG>0 || MI_SECURE>=4)
if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) {
_mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
return NULL;
}
#endif
return segment;
}
// Free a block
// Fast path written carefully to prevent register spilling on the stack
void mi_free(void* p) mi_attr_noexcept
{
mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
if mi_unlikely(segment==NULL) return;
mi_page_t* const page = mi_checked_ptr_page(p,"mi_free");
const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
mi_page_t* const page = _mi_segment_page_of(segment, p);
#if MI_PAGE_MAP_FLAT // if not flat, NULL will point to `_mi_page_empty` and get to `mi_free_generic_mt`
if mi_unlikely(page==NULL) return;
#endif
if mi_likely(is_local) { // thread-local free?
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
// thread-local, aligned, and not a full page
mi_block_t* const block = (mi_block_t*)p;
mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
}
else {
// page is full or contains (inner) aligned blocks; use generic path
mi_free_generic_local(page, segment, p);
}
const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page));
if mi_likely(xtid == 0) { // thread-local free? `tid==mi_page_thread_id(page) && mi_page_flags(page)==0`
// thread-local, aligned, and not a full page
mi_block_t* const block = (mi_block_t*)p;
mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
}
else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid == mi_page_thread_id(page) && mi_page_flags(page)!=0`
// page is local, but is full or contains (inner) aligned blocks; use generic path
mi_free_generic_local(page, p);
}
// free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap)
else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0`
// blocks are aligned (and not a full page)
mi_block_t* const block = (mi_block_t*)p;
mi_free_block_mt(page,block);
}
else {
// not thread-local; use generic path
mi_free_generic_mt(page, segment, p);
}
// page is full or contains (inner) aligned blocks; use generic multi-thread path
mi_free_generic_mt(page, p);
}
}
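(Standalone illustration, not part of the diff: the xor-based dispatch in `mi_free` above, assuming, as the combined `xthread_id` requires, that thread ids always have the low flag bits clear so the page flags can be packed into the same word; `FLAG_MASK` is a stand-in for `MI_PAGE_FLAG_MASK`.)
  #include <stdint.h>
  #define FLAG_MASK ((uintptr_t)3)   /* e.g. the "full" and "has-aligned" flag bits */
  typedef enum { FREE_LOCAL_FAST, FREE_LOCAL_GENERIC, FREE_MT_FAST, FREE_MT_GENERIC } free_path_t;
  static free_path_t dispatch(uintptr_t my_tid, uintptr_t page_xthread_id) {
    const uintptr_t xtid = my_tid ^ page_xthread_id;
    if (xtid == 0)               return FREE_LOCAL_FAST;     /* same thread, no flags set  */
    if (xtid <= FLAG_MASK)       return FREE_LOCAL_GENERIC;  /* same thread, some flag set */
    if ((xtid & FLAG_MASK) == 0) return FREE_MT_FAST;        /* other thread, no flags set */
    return FREE_MT_GENERIC;                                  /* other thread, flagged page */
  }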
// return true if successful
bool _mi_free_delayed_block(mi_block_t* block) {
// get segment and page
mi_assert_internal(block!=NULL);
const mi_segment_t* const segment = _mi_ptr_segment(block);
mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(_mi_thread_id() == segment->thread_id);
mi_page_t* const page = _mi_segment_page_of(segment, block);
// Clear the no-delayed flag so delayed freeing is used again for this page.
// This must be done before collecting the free lists on this page -- otherwise
// some blocks may end up in the page `thread_free` list with no blocks in the
// heap `thread_delayed_free` list which may cause the page to be never freed!
// (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) {
return false;
}
// collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count
_mi_page_free_collect(page, false);
// and free the block (possibly freeing the page as well since `used` is updated)
mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */);
return true;
}
// ------------------------------------------------------
// Multi-threaded Free (`_mt`)
// ------------------------------------------------------
// Push a block that is owned by another thread on its page-local thread free
// list or its heap delayed free list. Such blocks are later collected by
// the owning thread in `_mi_free_delayed_block`.
static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block )
{
// Try to put the block on either the page-local thread free list,
// or the heap delayed free list (if this is the first non-local free in that page)
mi_thread_free_t tfreex;
bool use_delayed;
mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE);
if mi_unlikely(use_delayed) {
// unlikely: this only happens on the first concurrent free in a page that is in the full list
tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING);
}
else {
// usual: directly add to page thread_free list
mi_block_set_next(page, block, mi_tf_block(tfree));
tfreex = mi_tf_set_block(tfree,block);
}
} while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
// If this was the first non-local free, we need to push it on the heap delayed free list instead
if mi_unlikely(use_delayed) {
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page);
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
do {
mi_block_set_nextx(heap,block,dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
}
static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept {
mi_assert_internal(mi_page_is_owned(page));
mi_assert_internal(mi_page_is_abandoned(page));
// and reset the MI_DELAYED_FREEING flag
tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
tfreex = tfree;
mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
} while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
}
}
// we own the page now..
// safe to collect the thread atomic free list
_mi_page_free_collect(page, false); // update `used` count
#if MI_DEBUG > 1
if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); }
#endif
// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block)
{
// first see if the segment was abandoned and if we can reclaim it into our thread
if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 &&
#if MI_HUGE_PAGE_ABANDON
segment->page_kind != MI_PAGE_HUGE &&
#endif
mi_atomic_load_relaxed(&segment->thread_id) == 0 && // segment is abandoned?
mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initialized (issue #944))
// 1. free if the page is free now
if (mi_page_all_free(page))
{
// the segment is abandoned, try to reclaim it into our heap
if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) {
mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc);
mi_free(block); // recursively free as now it will be a local free in our heap
return;
// first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish)
_mi_arenas_page_unabandon(page);
// we can free the page directly
_mi_arenas_page_free(page);
return;
}
// 2. if the page is not too full, we can try to reclaim it for ourselves
// note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit.
if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 &&
!mi_page_is_used_at_frac(page,8)
// && !mi_page_is_abandoned_mapped(page)
)
{
// the page has still some blocks in use (but not too many)
// reclaim in our heap if compatible, or otherwise abandon again
// todo: optimize this check further?
// note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should
// not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944)
mi_heap_t* const heap = mi_prim_get_default_heap();
if (heap != (mi_heap_t*)&_mi_heap_empty) // we did not already terminate our thread (can this happen?)
{
mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag);
if ((tagheap != NULL) && // don't reclaim across heap object types
(tagheap->allow_page_reclaim) && // we are allowed to reclaim abandoned pages
// (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? )
(_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arenas; todo: inline arena_is_suitable (?)
)
{
if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for a block_size we don't use
// first remove it from the abandoned pages in the arena -- this waits for any readers to finish
_mi_arenas_page_unabandon(page);
_mi_heap_page_reclaim(tagheap, page);
mi_heap_stat_counter_increase(tagheap, pages_reclaim_on_free, 1);
return;
}
}
}
}
// The padding check may access the non-thread-owned page for the key values.
// That is safe as these are constant and the page won't be freed (as the block is not freed yet).
mi_check_padding(page, block);
// adjust stats (after padding check and potentially recursive `mi_free` above)
mi_stat_free(page, block); // stat_free may access the padding
mi_track_free_size(block, mi_page_usable_size_of(page,block));
// for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
_mi_padding_shrink(page, block, sizeof(mi_block_t));
if (segment->page_kind == MI_PAGE_HUGE) {
#if MI_HUGE_PAGE_ABANDON
// huge page segments are always abandoned and can be freed immediately
_mi_segment_huge_page_free(segment, page, block);
// 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations
if (!mi_page_is_used_at_frac(page,8) && // only reabandon once the page has enough free blocks again, to prevent immediately re-abandoning a (nearly) full page
!mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA &&
_mi_arenas_page_try_reabandon_to_mapped(page))
{
return;
#else
// huge pages are special as they occupy the entire segment
// as these are large we reset the memory occupied by the page so it is available to other threads
// (as the owning thread needs to actually free the memory later).
_mi_segment_huge_page_reset(segment, page, block);
#endif
}
else {
#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading
memset(block, MI_DEBUG_FREED, mi_usable_size(block));
#endif
}
// and finally free the actual block by pushing it on the owning heap
// thread_delayed free list (or heap delayed free list)
mi_free_block_delayed_mt(page,block);
// not reclaimed or free'd, unown again
_mi_page_unown(page);
}
@ -316,9 +282,8 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p
}
static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg);
if mi_unlikely(segment==NULL) return 0;
const mi_page_t* const page = _mi_segment_page_of(segment, p);
const mi_page_t* const page = mi_checked_ptr_page(p,msg);
if mi_unlikely(page==NULL) return 0;
if mi_likely(!mi_page_has_aligned(page)) {
const mi_block_t* block = (const mi_block_t*)p;
return mi_page_usable_size_of(page, block);
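With the page map in place, `_mi_usable_size` no longer goes through the segment but looks the page up directly from the pointer. From the application side the behavior is unchanged; a small sketch of the public API (assuming mimalloc is linked and `<mimalloc.h>` is on the include path). The reported size may exceed the requested size because requests are rounded up to the block size of their size class:
#include <mimalloc.h>
#include <stdio.h>
int main(void) {
  void* p = mi_malloc(100);
  if (p != NULL) {
    // usable size is at least 100, typically the full block size of the size class
    printf("requested 100 bytes, usable %zu bytes\n", mi_usable_size(p));
    mi_free(p);
  }
  return 0;
}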
@ -513,21 +478,21 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
// only maintain stats for smaller objects if requested
#if (MI_STAT>0)
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
#if (MI_STAT < 2)
void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
#if (MI_STAT < 2)
MI_UNUSED(block);
#endif
#endif
mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_usable_block_size(page);
#if (MI_STAT>1)
#if (MI_STAT>1)
const size_t usize = mi_page_usable_size_of(page, block);
mi_heap_stat_decrease(heap, malloc, usize);
#endif
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
#endif
if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, normal, bsize);
#if (MI_STAT > 1)
#if (MI_STAT > 1)
mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1);
#endif
#endif
}
else {
const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc
@ -535,7 +500,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
}
}
#else
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
MI_UNUSED(page); MI_UNUSED(block);
}
#endif
@ -553,7 +518,7 @@ static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) {
const size_t bsize = mi_page_block_size(page);
const size_t psize = _mi_os_page_size();
mi_assert_internal(bsize > psize);
mi_assert_internal(_mi_page_segment(page)->allow_decommit);
mi_assert_internal(!page->memid.is_pinned);
void* gpage = (uint8_t*)block + bsize - psize;
mi_assert_internal(_mi_is_aligned(gpage, psize));
_mi_os_unprotect(gpage, psize);

View file

@ -7,11 +7,8 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"
#include "mimalloc/prim.h" // mi_prim_get_default_heap
#include <string.h> // memset, memcpy
#if defined(_MSC_VER) && (_MSC_VER < 1920)
#pragma warning(disable:4204) // non-constant aggregate initializer
#endif
@ -58,8 +55,6 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
MI_UNUSED(arg2);
MI_UNUSED(pq);
mi_assert_internal(mi_page_heap(page) == heap);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == heap->thread_id);
mi_assert_expensive(_mi_page_is_valid(page));
return true;
}
@ -98,7 +93,7 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t
if (mi_page_all_free(page)) {
// no more used blocks, free the page.
// note: this will free retired pages as well.
_mi_page_free(page, pq, collect >= MI_FORCE);
_mi_page_free(page, pq);
}
else if (collect == MI_ABANDON) {
// still used blocks but the thread is done; abandon the page
@ -107,14 +102,6 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t
return true; // don't break
}
static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
MI_UNUSED(arg1);
MI_UNUSED(arg2);
MI_UNUSED(heap);
MI_UNUSED(pq);
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
return true; // don't break
}
static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
{
@ -124,49 +111,19 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
_mi_deferred_free(heap, force);
// python/cpython#112532: we may be called from a thread that is not the owner of the heap
const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id());
// note: never reclaim on collect but leave it to threads that need storage to reclaim
if (
#ifdef NDEBUG
collect == MI_FORCE
#else
collect >= MI_FORCE
#endif
&& is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim)
{
// the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
// if all memory is freed by now, all segments should be freed.
// note: this only collects in the current subprocess
_mi_abandoned_reclaim_all(heap, &heap->tld->segments);
}
// if abandoning, mark all pages to no longer add to delayed_free
if (collect == MI_ABANDON) {
mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
}
// free all current thread delayed blocks.
// (if abandoning, after this there are no more thread-delayed references into the pages.)
_mi_heap_delayed_free_all(heap);
// const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id());
// collect retired pages
_mi_heap_collect_retired(heap, force);
// if (_mi_is_main_thread()) { mi_debug_show_arenas(true, false, false); }
// collect all pages owned by this thread
mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
// collect segments (purge pages, this can be expensive so don't force on abandonment)
_mi_segments_collect(collect == MI_FORCE, &heap->tld->segments);
// if forced, collect thread data cache on program-exit (or shared library unload)
if (force && is_main_thread && mi_heap_is_backing(heap)) {
_mi_thread_data_collect(); // collect thread data cache
}
// collect arenas (this is program wide so don't force purges on abandonment of threads)
_mi_arenas_collect(collect == MI_FORCE /* force purge? */);
// collect arenas (this is program wide so don't force purges on abandonment of threads)
//mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1);
_mi_arenas_collect(collect == MI_FORCE /* force purge? */, true /* visit all? */, heap->tld);
}
void _mi_heap_collect_abandon(mi_heap_t* heap) {
@ -187,8 +144,12 @@ void mi_collect(bool force) mi_attr_noexcept {
----------------------------------------------------------- */
mi_heap_t* mi_heap_get_default(void) {
mi_thread_init();
return mi_prim_get_default_heap();
mi_heap_t* heap = mi_prim_get_default_heap();
if mi_unlikely(!mi_heap_is_initialized(heap)) {
mi_thread_init();
heap = mi_prim_get_default_heap();
}
return heap;
}
static bool mi_heap_is_default(const mi_heap_t* heap) {
@ -201,39 +162,77 @@ mi_heap_t* mi_heap_get_backing(void) {
mi_assert_internal(heap!=NULL);
mi_heap_t* bheap = heap->tld->heap_backing;
mi_assert_internal(bheap!=NULL);
mi_assert_internal(bheap->thread_id == _mi_thread_id());
mi_assert_internal(bheap->tld->thread_id == _mi_thread_id());
return bheap;
}
void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) {
// todo: make order of parameters consistent (but would that break compat with CPython?)
void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld)
{
mi_assert_internal(heap!=NULL);
mi_memid_t memid = heap->memid;
_mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
heap->tld = tld;
heap->thread_id = _mi_thread_id();
heap->arena_id = arena_id;
heap->no_reclaim = noreclaim;
heap->tag = tag;
if (heap == tld->heap_backing) {
heap->memid = memid;
heap->tld = tld; // avoid reading the thread-local tld during initialization
heap->exclusive_arena = _mi_arena_from_id(arena_id);
heap->allow_page_reclaim = !noreclaim;
heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_page_full_retain) >= 0);
heap->full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32);
heap->tag = heap_tag;
if (heap->tld->is_in_threadpool) {
// if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap.
// (but abandoning is good in this case)
heap->allow_page_reclaim = false;
// and reduce the full page retain to a quarter (possibly to 0)
if (heap->full_page_retain >= 0) {
heap->full_page_retain = heap->full_page_retain / 4;
}
}
if (heap->tld->heap_backing == NULL) {
heap->tld->heap_backing = heap; // first heap becomes the backing heap
_mi_random_init(&heap->random);
}
else {
_mi_random_split(&tld->heap_backing->random, &heap->random);
_mi_random_split(&heap->tld->heap_backing->random, &heap->random);
}
heap->cookie = _mi_heap_random_next(heap) | 1;
heap->keys[0] = _mi_heap_random_next(heap);
heap->keys[1] = _mi_heap_random_next(heap);
//heap->keys[0] = _mi_heap_random_next(heap);
//heap->keys[1] = _mi_heap_random_next(heap);
_mi_heap_guarded_init(heap);
// push on the thread local heaps list
heap->next = heap->tld->heaps;
heap->tld->heaps = heap;
}
mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld) {
mi_assert_internal(tld!=NULL);
mi_assert(heap_tag >= 0 && heap_tag < 256);
// allocate and initialize a heap
mi_memid_t memid;
mi_heap_t* heap;
if (arena_id == _mi_arena_id_none()) {
heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid);
}
else {
// heaps associated with a specific arena are allocated in that arena
// note: takes up at least one slice which is quite wasteful...
heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid);
}
if (heap==NULL) {
_mi_error_message(ENOMEM, "unable to allocate heap meta-data\n");
return NULL;
}
heap->memid = memid;
_mi_heap_init(heap, arena_id, allow_destroy, (uint8_t)heap_tag, tld);
return heap;
}
mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) {
mi_heap_t* bheap = mi_heap_get_backing();
mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode?
if (heap == NULL) return NULL;
mi_assert(heap_tag >= 0 && heap_tag < 256);
_mi_heap_init(heap, bheap->tld, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */);
return heap;
mi_assert_internal(bheap != NULL);
return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld);
}
mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
@ -246,7 +245,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new(void) {
}
bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) {
return _mi_arena_memid_is_suitable(memid, heap->arena_id);
return _mi_arena_memid_is_suitable(memid, heap->exclusive_arena);
}
uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
@ -258,14 +257,14 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
mi_assert_internal(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap));
// TODO: copy full empty heap instead?
memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
_mi_memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
_mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
heap->thread_delayed_free = NULL;
// heap->thread_delayed_free = NULL;
heap->page_count = 0;
}
// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
static void mi_heap_free(mi_heap_t* heap) {
static void mi_heap_free(mi_heap_t* heap, bool do_free_mem) {
mi_assert(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
@ -292,7 +291,9 @@ static void mi_heap_free(mi_heap_t* heap) {
mi_assert_internal(heap->tld->heaps != NULL);
// and free the used memory
mi_free(heap);
if (do_free_mem) {
_mi_meta_free(heap, sizeof(*heap), heap->memid);
}
}
// return a heap on the same thread as `heap` specialized for the specified tag (if it exists)
@ -319,24 +320,24 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
MI_UNUSED(pq);
// ensure no more thread_delayed_free will be added
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
//_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
// stats
const size_t bsize = mi_page_block_size(page);
if (bsize > MI_LARGE_OBJ_SIZE_MAX) {
if (bsize > MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, huge, bsize);
}
#if (MI_STAT)
#if (MI_STAT)
_mi_page_free_collect(page, false); // update used count
const size_t inuse = page->used;
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, normal, bsize * inuse);
#if (MI_STAT>1)
#if (MI_STAT>1)
mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse);
#endif
#endif
}
mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks...
#endif
#endif
/// pretend it is all free now
mi_assert_internal(mi_page_thread_free(page) == NULL);
@ -346,7 +347,8 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
// mi_page_free(page,false);
page->next = NULL;
page->prev = NULL;
_mi_segment_page_free(page,false /* no force? */, &heap->tld->segments);
mi_page_set_heap(page, NULL);
_mi_arenas_page_free(page);
return true; // keep going
}
@ -367,7 +369,8 @@ static bool mi_cdecl mi_heap_track_block_free(const mi_heap_t* heap, const mi_he
void mi_heap_destroy(mi_heap_t* heap) {
mi_assert(heap != NULL);
mi_assert(mi_heap_is_initialized(heap));
mi_assert(heap->no_reclaim);
mi_assert(!heap->allow_page_reclaim);
mi_assert(!heap->allow_page_abandon);
mi_assert_expensive(mi_heap_is_valid(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
#if MI_GUARDED
@ -375,9 +378,9 @@ void mi_heap_destroy(mi_heap_t* heap) {
mi_heap_delete(heap);
return;
#else
if (!heap->no_reclaim) {
if (heap->allow_page_reclaim) {
_mi_warning_message("'mi_heap_destroy' called but ignored as the heap was not created with 'allow_destroy' (heap at %p)\n", heap);
// don't free in case it may contain reclaimed pages
// don't free in case it may contain reclaimed pages,
mi_heap_delete(heap);
}
else {
@ -387,7 +390,7 @@ void mi_heap_destroy(mi_heap_t* heap) {
#endif
// free all pages
_mi_heap_destroy_pages(heap);
mi_heap_free(heap);
mi_heap_free(heap,true);
}
#endif
}
@ -399,7 +402,7 @@ void _mi_heap_unsafe_destroy_all(mi_heap_t* heap) {
mi_heap_t* curr = heap->tld->heaps;
while (curr != NULL) {
mi_heap_t* next = curr->next;
if (curr->no_reclaim) {
if (!curr->allow_page_reclaim) {
mi_heap_destroy(curr);
}
else {
@ -414,44 +417,30 @@ void _mi_heap_unsafe_destroy_all(mi_heap_t* heap) {
----------------------------------------------------------- */
// Transfer the pages from one heap to the other
static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
mi_assert_internal(heap!=NULL);
if (from==NULL || from->page_count == 0) return;
//static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
// mi_assert_internal(heap!=NULL);
// if (from==NULL || from->page_count == 0) return;
//
// // transfer all pages by appending the queues; this will set a new heap field
// for (size_t i = 0; i <= MI_BIN_FULL; i++) {
// mi_page_queue_t* pq = &heap->pages[i];
// mi_page_queue_t* append = &from->pages[i];
// size_t pcount = _mi_page_queue_append(heap, pq, append);
// heap->page_count += pcount;
// from->page_count -= pcount;
// }
// mi_assert_internal(from->page_count == 0);
//
// // and reset the `from` heap
// mi_heap_reset_pages(from);
//}
// reduce the size of the delayed frees
_mi_heap_delayed_free_partial(from);
// transfer all pages by appending the queues; this will set a new heap field
// so threads may do delayed frees in either heap for a while.
// note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state
// so after this only the new heap will get delayed frees
for (size_t i = 0; i <= MI_BIN_FULL; i++) {
mi_page_queue_t* pq = &heap->pages[i];
mi_page_queue_t* append = &from->pages[i];
size_t pcount = _mi_page_queue_append(heap, pq, append);
heap->page_count += pcount;
from->page_count -= pcount;
}
mi_assert_internal(from->page_count == 0);
// and do outstanding delayed frees in the `from` heap
// note: be careful here as the `heap` field in all those pages no longer point to `from`,
// turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a
// the regular `_mi_free_delayed_block` which is safe.
_mi_heap_delayed_free_all(from);
#if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353
mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL);
#endif
// and reset the `from` heap
mi_heap_reset_pages(from);
}
// are two heaps compatible with respect to heap-tag, exclusive arena etc.
static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) {
return (heap1->tag == heap2->tag && // store same kind of objects
heap1->arena_id == heap2->arena_id); // same arena preference
}
//// are two heaps compatible with respect to heap-tag, exclusive arena etc.
//static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) {
// return (heap1->tag == heap2->tag && // store same kind of objects
// heap1->tld->subproc == heap2->tld->subproc && // same sub-process
// heap1->arena_id == heap2->arena_id); // same arena preference
//}
// Safe delete a heap without freeing any still allocated blocks in that heap.
void mi_heap_delete(mi_heap_t* heap)
@ -461,17 +450,11 @@ void mi_heap_delete(mi_heap_t* heap)
mi_assert_expensive(mi_heap_is_valid(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
mi_heap_t* bheap = heap->tld->heap_backing;
if (bheap != heap && mi_heaps_are_compatible(bheap,heap)) {
// transfer still used pages to the backing heap
mi_heap_absorb(bheap, heap);
}
else {
// the backing heap abandons its pages
_mi_heap_collect_abandon(heap);
}
// abandon all pages
_mi_heap_collect_abandon(heap);
mi_assert_internal(heap->page_count==0);
mi_heap_free(heap);
mi_heap_free(heap,true);
}
mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
@ -485,7 +468,63 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
}
/* -----------------------------------------------------------
Load/unload heaps
----------------------------------------------------------- */
void mi_heap_unload(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
mi_assert_expensive(mi_heap_is_valid(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
if (heap->exclusive_arena == NULL) {
_mi_warning_message("cannot unload heaps that are not associated with an exclusive arena\n");
return;
}
// abandon all pages so all thread ids in the pages are cleared
_mi_heap_collect_abandon(heap);
mi_assert_internal(heap->page_count==0);
// remove from heap list
mi_heap_free(heap, false /* but don't actually free the memory */);
// disassociate from the current thread-local and static state
heap->tld = NULL;
return;
}
bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) {
mi_assert(mi_heap_is_initialized(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
if (heap->exclusive_arena == NULL) {
_mi_warning_message("cannot reload heaps that were not associated with an exclusive arena\n");
return false;
}
if (heap->tld != NULL) {
_mi_warning_message("cannot reload heaps that were not unloaded first\n");
return false;
}
mi_arena_t* arena = _mi_arena_from_id(arena_id);
if (heap->exclusive_arena != arena) {
_mi_warning_message("trying to reload a heap at a different arena address: %p vs %p\n", heap->exclusive_arena, arena);
return false;
}
mi_assert_internal(heap->page_count==0);
// re-associate with the current thread-local and static state
heap->tld = mi_heap_get_default()->tld;
// reinit direct pages (as we may be in a different process)
mi_assert_internal(heap->page_count == 0);
for (size_t i = 0; i < MI_PAGES_DIRECT; i++) {
heap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty;
}
// push on the thread local heaps list
heap->next = heap->tld->heaps;
heap->tld->heaps = heap;
return true;
}
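Together, `mi_heap_unload` and `mi_heap_reload` support detaching a heap that lives entirely in an exclusive arena and re-attaching it later, possibly from a different thread (or process, if the arena memory is mapped at the same address). A hedged usage sketch, assuming the arena is reserved with `mi_reserve_os_memory_ex` and with error handling kept minimal:
#include <mimalloc.h>
static void unload_reload_example(void) {   // hypothetical helper
  mi_arena_id_t arena_id;
  if (mi_reserve_os_memory_ex(64 * 1024 * 1024, true /* commit */, false /* allow_large */,
                              true /* exclusive */, &arena_id) != 0) return;
  mi_heap_t* heap = mi_heap_new_in_arena(arena_id);  // heap whose pages come only from that arena
  if (heap == NULL) return;
  void* p = mi_heap_malloc(heap, 128);
  mi_free(p);
  mi_heap_unload(heap);   // abandons all pages and detaches the heap from this thread
  // ... later, possibly from another thread that sees the same arena at the same address:
  if (mi_heap_reload(heap, arena_id)) {
    void* q = mi_heap_malloc(heap, 256);
    mi_free(q);
    mi_heap_delete(heap);
  }
}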
/* -----------------------------------------------------------
Analysis
@ -494,11 +533,8 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
// static since it is not thread safe to access heaps from other threads.
static mi_heap_t* mi_heap_of_block(const void* p) {
if (p == NULL) return NULL;
mi_segment_t* segment = _mi_ptr_segment(p);
bool valid = (_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(valid);
if mi_unlikely(!valid) return NULL;
return mi_page_heap(_mi_segment_page_of(segment,p));
mi_page_t* page = _mi_ptr_page(p); // TODO: check pointer validity?
return mi_page_heap(page);
}
bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
@ -573,7 +609,7 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_
if (page->used == 0) return true;
size_t psize;
uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize);
uint8_t* const pstart = mi_page_area(page, &psize);
mi_heap_t* const heap = mi_page_heap(page);
const size_t bsize = mi_page_block_size(page);
const size_t ubsize = mi_page_usable_block_size(page); // without padding

View file

@ -11,32 +11,31 @@ terms of the MIT license. A copy of the license can be found in the file
#include <string.h> // memcpy, memset
#include <stdlib.h> // atexit
#define MI_MEMID_INIT(kind) {{{NULL,0}}, kind, true /* pinned */, true /* committed */, false /* zero */ }
#define MI_MEMID_STATIC MI_MEMID_INIT(MI_MEM_STATIC)
// Empty page used to initialize the small free pages array
const mi_page_t _mi_page_empty = {
0,
false, false, false, false,
0, // capacity
0, // reserved capacity
{ 0 }, // flags
false, // is_zero
0, // retire_expire
NULL, // free
NULL, // local_free
0, // used
0, // block size shift
0, // heap tag
0, // block_size
NULL, // page_start
MI_ATOMIC_VAR_INIT(MI_PAGE_IN_FULL_QUEUE), // xthread_id (must set flag to catch NULL on a free)
NULL, // free
0, // used
0, // capacity
0, // reserved capacity
0, // block size shift
0, // retire_expire
NULL, // local_free
MI_ATOMIC_VAR_INIT(0), // xthread_free
0, // block_size
NULL, // page_start
0, // heap tag
false, // is_zero
#if (MI_PADDING || MI_ENCODE_FREELIST)
{ 0, 0 },
#endif
MI_ATOMIC_VAR_INIT(0), // xthread_free
MI_ATOMIC_VAR_INIT(0), // xheap
NULL, NULL
#if MI_INTPTR_SIZE==4
, { NULL }
{ 0, 0 }, // keys
#endif
NULL, // xheap
NULL, NULL, // next, prev
MI_ARENA_SLICE_SIZE, // page_committed
MI_MEMID_STATIC // memid
};
#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
@ -63,8 +62,8 @@ const mi_page_t _mi_page_empty = {
QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \
QNULL(MI_LARGE_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \
QNULL(MI_LARGE_OBJ_WSIZE_MAX + 2) /* Full queue */ }
QNULL(MI_LARGE_MAX_OBJ_WSIZE + 1 /* 655360, Huge queue */), \
QNULL(MI_LARGE_MAX_OBJ_WSIZE + 2) /* Full queue */ }
#define MI_STAT_COUNT_NULL() {0,0,0,0}
@ -82,12 +81,10 @@ const mi_page_t _mi_page_empty = {
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 } \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
MI_STAT_COUNT_END_NULL()
// --------------------------------------------------------
@ -99,24 +96,83 @@ const mi_page_t _mi_page_empty = {
// may lead to allocation itself on some platforms)
// --------------------------------------------------------
static mi_decl_cache_align mi_subproc_t subproc_main
#if __cplusplus
= { }; // empty initializer to prevent running the constructor (with msvc)
#else
= { 0 }; // C zero initialize
#endif
static mi_decl_cache_align mi_tld_t tld_empty = {
0, // thread_id
0, // thread_seq
&subproc_main, // subproc
NULL, // heap_backing
NULL, // heaps list
0, // heartbeat
false, // recurse
false, // is_in_threadpool
{ MI_STATS_NULL }, // stats
MI_MEMID_STATIC // memid
};
mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
NULL,
MI_ATOMIC_VAR_INIT(NULL),
0, // tid
0, // cookie
0, // arena id
{ 0, 0 }, // keys
{ {0}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
NULL, // next
false, // can reclaim
0, // tag
&tld_empty, // tld
NULL, // exclusive_arena
0, // cookie
//{ 0, 0 }, // keys
{ {0}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
0, // generic count
NULL, // next
0, // full page retain
false, // can reclaim
true, // can eager abandon
0, // tag
#if MI_GUARDED
0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY
MI_PAGE_QUEUES_EMPTY,
MI_MEMID_STATIC
};
extern mi_heap_t heap_main;
static mi_decl_cache_align mi_tld_t tld_main = {
0, // thread_id
0, // thread_seq
&subproc_main, // subproc
&heap_main, // heap_backing
&heap_main, // heaps list
0, // heartbeat
false, // recurse
false, // is_in_threadpool
{ MI_STATS_NULL }, // stats
MI_MEMID_STATIC // memid
};
mi_decl_cache_align mi_heap_t heap_main = {
&tld_main, // thread local data
NULL, // exclusive arena
0, // initial cookie
//{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0x846ca68b}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
0, // generic count
NULL, // next heap
2, // full page retain
true, // allow page reclaim
true, // allow page abandon
0, // tag
#if MI_GUARDED
0, 0, 0, 0, 0,
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY,
MI_MEMID_STATIC
};
@ -127,39 +183,6 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
// the thread-local default heap for allocation
mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
extern mi_heap_t _mi_heap_main;
static mi_decl_cache_align mi_subproc_t mi_subproc_default;
static mi_decl_cache_align mi_tld_t tld_main = {
0, false,
&_mi_heap_main, &_mi_heap_main,
{ { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0},
0, 0, 0, 0, 0, &mi_subproc_default,
&tld_main.stats
}, // segments
{ MI_STATS_NULL } // stats
};
mi_decl_cache_align mi_heap_t _mi_heap_main = {
&tld_main,
MI_ATOMIC_VAR_INIT(NULL),
0, // thread id
0, // initial cookie
0, // arena id
{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0x846ca68b}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
NULL, // next heap
false, // can reclaim
0, // tag
#if MI_GUARDED
0, 0, 0, 0, 0,
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY
};
bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`.
@ -175,7 +198,7 @@ mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t samp
if (heap->guarded_sample_rate >= 1) {
heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate;
}
heap->guarded_sample_count = heap->guarded_sample_seed; // count down samples
heap->guarded_sample_count = 1 + heap->guarded_sample_seed; // count down samples
}
mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) {
@ -204,28 +227,132 @@ void _mi_heap_guarded_init(mi_heap_t* heap) {
}
#endif
static void mi_heap_main_init(void) {
if (_mi_heap_main.cookie == 0) {
_mi_heap_main.thread_id = _mi_thread_id();
_mi_heap_main.cookie = 1;
#if defined(_WIN32) && !defined(MI_SHARED_LIB)
_mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking
#else
_mi_random_init(&_mi_heap_main.random);
#endif
_mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
mi_lock_init(&mi_subproc_default.abandoned_os_lock);
mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock);
_mi_heap_guarded_init(&_mi_heap_main);
// Initialize main subproc
static void mi_subproc_main_init(void) {
if (subproc_main.memid.memkind != MI_MEM_STATIC) {
subproc_main.memid = _mi_memid_create(MI_MEM_STATIC);
mi_lock_init(&subproc_main.os_abandoned_pages_lock);
mi_lock_init(&subproc_main.arena_reserve_lock);
}
}
mi_heap_t* _mi_heap_main_get(void) {
// Initialize main tld
static void mi_tld_main_init(void) {
if (tld_main.thread_id == 0) {
tld_main.thread_id = _mi_prim_thread_id();
}
}
// Initialization of the (statically allocated) main heap, and the main tld and subproc.
static void mi_heap_main_init(void) {
if (heap_main.cookie == 0) {
mi_subproc_main_init();
mi_tld_main_init();
// heap
heap_main.cookie = 1;
#if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB)
_mi_random_init_weak(&heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking
#else
_mi_random_init(&heap_main.random);
#endif
heap_main.cookie = _mi_heap_random_next(&heap_main);
//heap_main.keys[0] = _mi_heap_random_next(&heap_main);
//heap_main.keys[1] = _mi_heap_random_next(&heap_main);
_mi_heap_guarded_init(&heap_main);
heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0);
heap_main.full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32);
}
}
mi_heap_t* heap_main_get(void) {
mi_heap_main_init();
return &_mi_heap_main;
return &heap_main;
}
/* -----------------------------------------------------------
Thread local data
----------------------------------------------------------- */
// Count current and total created threads
static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
static _Atomic(size_t) thread_total_count;
size_t _mi_current_thread_count(void) {
return mi_atomic_load_relaxed(&thread_count);
}
// The mimalloc thread local data
mi_decl_thread mi_tld_t* thread_tld = &tld_empty;
// Allocate fresh tld
static mi_tld_t* mi_tld_alloc(void) {
mi_atomic_increment_relaxed(&thread_count);
if (_mi_is_main_thread()) {
return &tld_main;
}
else {
// allocate tld meta-data
// note: we need to be careful to not access the tld from `_mi_meta_zalloc`
// (and in turn from `_mi_arena_alloc_aligned` and `_mi_os_alloc_aligned`).
mi_memid_t memid;
mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid);
if (tld==NULL) {
_mi_error_message(ENOMEM, "unable to allocate memory for thread local data\n");
return NULL;
}
tld->memid = memid;
tld->heap_backing = NULL;
tld->heaps = NULL;
tld->subproc = &subproc_main;
tld->thread_id = _mi_prim_thread_id();
tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1);
tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool();
return tld;
}
}
#define MI_TLD_INVALID ((mi_tld_t*)1)
mi_decl_noinline static void mi_tld_free(mi_tld_t* tld) {
if (tld != NULL && tld != MI_TLD_INVALID) {
_mi_stats_done(&tld->stats);
_mi_meta_free(tld, sizeof(mi_tld_t), tld->memid);
}
#if 0
// do not read/write to `thread_tld` on older macOS <= 14 as that will re-initialize the thread local storage
// (since we are calling this during pthread shutdown)
// (and this could happen on other systems as well, so let's never do it)
thread_tld = MI_TLD_INVALID;
#endif
mi_atomic_decrement_relaxed(&thread_count);
}
static mi_tld_t* mi_tld(void) {
mi_tld_t* tld = thread_tld;
if (tld == MI_TLD_INVALID) {
_mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n");
thread_tld = &tld_empty;
}
if (tld==&tld_empty) {
thread_tld = tld = mi_tld_alloc();
}
return tld;
}
mi_subproc_t* _mi_subproc(void) {
// should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()`)
// todo: this will still fail on systems where the first access to a thread-local causes allocation.
// on such systems we can check for this with the _mi_prim_get_default_heap as those are protected (by being
// stored in a TLS slot for example)
mi_heap_t* heap = mi_prim_get_default_heap();
if (heap == NULL) {
return _mi_subproc_main();
}
else {
return heap->tld->subproc; // avoid using thread local storage (`thread_tld`)
}
}
@ -233,179 +360,99 @@ mi_heap_t* _mi_heap_main_get(void) {
Sub process
----------------------------------------------------------- */
mi_subproc_t* _mi_subproc_main(void) {
return &subproc_main;
}
mi_subproc_id_t mi_subproc_main(void) {
return NULL;
}
mi_subproc_id_t mi_subproc_new(void) {
mi_memid_t memid = _mi_memid_none();
mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid);
mi_memid_t memid;
mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid);
if (subproc == NULL) return NULL;
subproc->memid = memid;
subproc->abandoned_os_list = NULL;
mi_lock_init(&subproc->abandoned_os_lock);
mi_lock_init(&subproc->abandoned_os_visit_lock);
mi_lock_init(&subproc->os_abandoned_pages_lock);
mi_lock_init(&subproc->arena_reserve_lock);
return subproc;
}
mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) {
return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id);
return (subproc_id == NULL ? &subproc_main : (mi_subproc_t*)subproc_id);
}
void mi_subproc_delete(mi_subproc_id_t subproc_id) {
if (subproc_id == NULL) return;
mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id);
// check if there are no abandoned segments still..
// check if there are os pages still..
bool safe_to_delete = false;
mi_lock(&subproc->abandoned_os_lock) {
if (subproc->abandoned_os_list == NULL) {
mi_lock(&subproc->os_abandoned_pages_lock) {
if (subproc->os_abandoned_pages == NULL) {
safe_to_delete = true;
}
}
if (!safe_to_delete) return;
// merge stats back into the main subproc?
_mi_stats_merge_from(&_mi_subproc_main()->stats, &subproc->stats);
// safe to release
// todo: should we refcount subprocesses?
mi_lock_done(&subproc->abandoned_os_lock);
mi_lock_done(&subproc->abandoned_os_visit_lock);
_mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t));
mi_lock_done(&subproc->os_abandoned_pages_lock);
mi_lock_done(&subproc->arena_reserve_lock);
_mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid);
}
void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) {
mi_heap_t* heap = mi_heap_get_default();
if (heap == NULL) return;
mi_assert(heap->tld->segments.subproc == &mi_subproc_default);
if (heap->tld->segments.subproc != &mi_subproc_default) return;
heap->tld->segments.subproc = _mi_subproc_from_id(subproc_id);
mi_tld_t* tld = mi_tld();
if (tld == NULL) return;
mi_assert(tld->subproc == &subproc_main);
if (tld->subproc != &subproc_main) return;
tld->subproc = _mi_subproc_from_id(subproc_id);
}
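The sub-process API isolates a group of threads so they do not share abandoned pages with threads outside the group. A hedged sketch of the intended call order; `worker` and `run_isolated` are made-up names and thread creation is elided:
#include <mimalloc.h>
static void worker(void* arg) {
  mi_subproc_id_t subproc = (mi_subproc_id_t)arg;
  // must run early in the thread, while it is still attached to the main sub-process
  mi_subproc_add_current_thread(subproc);
  void* p = mi_malloc(512);
  mi_free(p);
}
static void run_isolated(void) {
  mi_subproc_id_t subproc = mi_subproc_new();
  // ... start worker threads with `subproc` as the argument and join them ...
  mi_subproc_delete(subproc);   // has no effect while abandoned OS pages remain (see `mi_subproc_delete` above)
}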
/* -----------------------------------------------------------
Initialization and freeing of the thread local heaps
Allocate heap data
----------------------------------------------------------- */
// note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size).
typedef struct mi_thread_data_s {
mi_heap_t heap; // must come first due to cast in `_mi_heap_done`
mi_tld_t tld;
mi_memid_t memid; // must come last due to zero'ing
} mi_thread_data_t;
// Thread meta-data is allocated directly from the OS. For
// some programs that do not use thread pools and allocate and
// destroy many OS threads, this may cause too much overhead
// per thread so we maintain a small cache of recently freed metadata.
#define TD_CACHE_SIZE (32)
static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
static mi_thread_data_t* mi_thread_data_zalloc(void) {
// try to find thread metadata in the cache
bool is_zero = false;
mi_thread_data_t* td = NULL;
for (int i = 0; i < TD_CACHE_SIZE; i++) {
td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
if (td != NULL) {
// found cached allocation, try use it
td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
if (td != NULL) {
break;
}
}
}
// if that fails, allocate as meta data
if (td == NULL) {
mi_memid_t memid;
td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
if (td == NULL) {
// if this fails, try once more. (issue #257)
td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
if (td == NULL) {
// really out of memory
_mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
}
}
if (td != NULL) {
td->memid = memid;
is_zero = memid.initially_zero;
}
}
if (td != NULL && !is_zero) {
_mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid));
}
return td;
}
static void mi_thread_data_free( mi_thread_data_t* tdfree ) {
// try to add the thread metadata to the cache
for (int i = 0; i < TD_CACHE_SIZE; i++) {
mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
if (td == NULL) {
mi_thread_data_t* expected = NULL;
if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) {
return;
}
}
}
// if that fails, just free it directly
_mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid);
}
void _mi_thread_data_collect(void) {
// free all thread metadata from the cache
for (int i = 0; i < TD_CACHE_SIZE; i++) {
mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
if (td != NULL) {
td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
if (td != NULL) {
_mi_os_free(td, sizeof(mi_thread_data_t), td->memid);
}
}
}
}
// Initialize the thread local default heap, called from `mi_thread_init`
static bool _mi_thread_heap_init(void) {
if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true;
if (_mi_is_main_thread()) {
// mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization
// mi_assert_internal(heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization
// the main heap is statically allocated
mi_heap_main_init();
_mi_heap_set_default_direct(&_mi_heap_main);
_mi_heap_set_default_direct(&heap_main);
//mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap());
}
else {
// use `_mi_os_alloc` to allocate directly from the OS
mi_thread_data_t* td = mi_thread_data_zalloc();
if (td == NULL) return false;
// allocates tld data
// note: we cannot access thread-locals yet as that can cause (recursive) allocation
// (on macOS <= 14 for example where the loader allocates thread-local data on demand).
mi_tld_t* tld = mi_tld_alloc();
mi_tld_t* tld = &td->tld;
mi_heap_t* heap = &td->heap;
_mi_tld_init(tld, heap); // must be before `_mi_heap_init`
_mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */);
// allocate and initialize the heap
mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld);
// associate the heap with this thread
// (this is safe; on macOS, for example, the heap is set in a dedicated TLS slot and thus does not cause recursive allocation)
_mi_heap_set_default_direct(heap);
// now that the heap is set for this thread, we can set the thread-local tld.
thread_tld = tld;
}
return false;
}
// initialize thread local data
void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) {
_mi_memzero_aligned(tld,sizeof(mi_tld_t));
tld->heap_backing = bheap;
tld->heaps = NULL;
tld->segments.subproc = &mi_subproc_default;
tld->segments.stats = &tld->stats;
}
// Free the thread local default heap (called from `mi_thread_done`)
static bool _mi_thread_heap_done(mi_heap_t* heap) {
if (!mi_heap_is_initialized(heap)) return true;
// reset default heap
_mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
_mi_heap_set_default_direct(_mi_is_main_thread() ? &heap_main : (mi_heap_t*)&_mi_heap_empty);
// switch to backing heap
heap = heap->tld->heap_backing;
@ -425,26 +472,22 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) {
mi_assert_internal(mi_heap_is_backing(heap));
// collect if not the main thread
if (heap != &_mi_heap_main) {
if (heap != &heap_main) {
_mi_heap_collect_abandon(heap);
}
// merge stats
_mi_stats_done(&heap->tld->stats);
// free heap meta data
_mi_meta_free(heap, sizeof(mi_heap_t), heap->memid);
// free if not the main thread
if (heap != &_mi_heap_main) {
mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id());
mi_thread_data_free((mi_thread_data_t*)heap);
}
else {
if (heap == &heap_main) {
#if 0
// never free the main thread even in debug mode; if a dll is linked statically with mimalloc,
// there may still be delete/free calls after the mi_fls_done is called. Issue #207
_mi_heap_destroy_pages(heap);
mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main);
mi_assert_internal(heap->tld->heap_backing == &heap_main);
#endif
}
return false;
}
@ -458,7 +501,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) {
// 1. windows dynamic library:
// call from DllMain on DLL_THREAD_DETACH
// 2. windows static library:
// use `FlsAlloc` to call a destructor when the thread is done
// use special linker section to call a destructor when the thread is done
// 3. unix, pthreads:
// use a pthread key to call a destructor when a pthread is done
//
@ -472,19 +515,14 @@ static void mi_process_setup_auto_thread_done(void) {
if (tls_initialized) return;
tls_initialized = true;
_mi_prim_thread_init_auto_done();
_mi_heap_set_default_direct(&_mi_heap_main);
_mi_heap_set_default_direct(&heap_main);
}
bool _mi_is_main_thread(void) {
return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id());
return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id());
}
static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
size_t _mi_current_thread_count(void) {
return mi_atomic_load_relaxed(&thread_count);
}
// This is called from the `mi_malloc_generic`
void mi_thread_init(void) mi_attr_noexcept
@ -497,8 +535,7 @@ void mi_thread_init(void) mi_attr_noexcept
// fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
if (_mi_thread_heap_init()) return; // returns true if already initialized
_mi_stat_increase(&_mi_stats_main.threads, 1);
mi_atomic_increment_relaxed(&thread_count);
mi_subproc_stat_increase(_mi_subproc_main(), threads, 1);
//_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
}
@ -520,14 +557,18 @@ void _mi_thread_done(mi_heap_t* heap)
}
// adjust stats
mi_atomic_decrement_relaxed(&thread_count);
_mi_stat_decrease(&_mi_stats_main.threads, 1);
mi_subproc_stat_decrease(_mi_subproc_main(), threads, 1);
// check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
if (heap->thread_id != _mi_thread_id()) return;
if (heap->tld->thread_id != _mi_prim_thread_id()) return;
// abandon the thread local heap
if (_mi_thread_heap_done(heap)) return; // returns true if already ran
// note: we store the tld as we should avoid reading `thread_tld` at this point (to avoid reinitializing the thread local storage)
mi_tld_t* tld = heap->tld;
_mi_thread_heap_done(heap); // returns true if already ran
// free thread local data
mi_tld_free(tld);
}
void _mi_heap_set_default_direct(mi_heap_t* heap) {
@ -580,7 +621,7 @@ void _mi_process_load(void) {
}
// reseed random
_mi_random_reinit_if_weak(&_mi_heap_main.random);
_mi_random_reinit_if_weak(&heap_main.random);
}
#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
@ -607,7 +648,7 @@ void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once
static mi_atomic_once_t process_init;
#if _MSC_VER < 1920
mi_heap_main_init(); // vs2017 can dynamically re-initialize _mi_heap_main
mi_heap_main_init(); // vs2017 can dynamically re-initialize heap_main
#endif
if (!mi_atomic_once(&process_init)) return;
_mi_process_is_initialized = true;
@ -615,8 +656,11 @@ void mi_process_init(void) mi_attr_noexcept {
mi_process_setup_auto_thread_done();
mi_detect_cpu_features();
_mi_os_init();
mi_subproc_main_init();
mi_tld_main_init();
mi_heap_main_init();
_mi_os_init();
_mi_page_map_init();
#if MI_DEBUG
_mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif
@ -627,7 +671,7 @@ void mi_process_init(void) mi_attr_noexcept {
#endif
mi_thread_init();
#if defined(_WIN32)
#if defined(_WIN32) && defined(MI_WIN_USE_FLS)
// On Windows, when building as a static lib, the FLS cleanup happens too early for the main thread.
// To avoid this, set the FLS value for the main thread to NULL so the fls cleanup
// will not call _mi_thread_done on the (still executing) main thread. See issue #508.
@ -686,15 +730,14 @@ void mi_cdecl _mi_process_done(void) {
if (mi_option_is_enabled(mi_option_destroy_on_exit)) {
mi_heap_collect(heap, true /* force */);
_mi_heap_unsafe_destroy_all(heap); // forcefully release all memory held by all heaps (of this thread only!)
_mi_arena_unsafe_destroy_all();
_mi_segment_map_unsafe_destroy();
_mi_arenas_unsafe_destroy_all(heap->tld);
}
if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
mi_stats_print(NULL);
}
_mi_allocator_done();
_mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
_mi_verbose_message("process done: 0x%zx\n", tld_main.thread_id);
os_preloading = true; // don't call the C runtime anymore
}

View file

@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file
// --------------------------------------------------------
// This module defines various std libc functions to reduce
// the dependency on libc, and also prevent errors caused
// the dependency on libc, and also prevent errors caused
// by some libc implementations when called before `main`
// executes (due to malloc redirection)
// --------------------------------------------------------
@ -83,9 +83,9 @@ bool _mi_getenv(const char* name, char* result, size_t result_size) {
// Define our own limited `_mi_vsnprintf` and `_mi_snprintf`
// This is mostly to avoid calling these when libc is not yet
// initialized (and to reduce dependencies)
//
// format: d i, p x u, s
// prec: z l ll L
//
// format: d i, p, x, u, s
// type: z l ll L
// width: 10
// align-left: -
// fill: 0
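For reference, a small sketch of what this limited formatter accepts, using only the specifiers listed above (`example_log` is a hypothetical helper for illustration; `_mi_snprintf` is defined further down in this file and `_mi_output_message` is added in options.c later in this diff):
static void example_log(void) {
  char   buf[128];
  void*  p     = NULL;
  size_t size  = 4096;
  long   count = 42;
  _mi_snprintf(buf, sizeof(buf), "block %p: %zu bytes (0x%zx), used %10ld\n", p, size, size, count);
  _mi_output_message("%s", buf);
}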
@ -130,7 +130,7 @@ static void mi_out_alignright(char fill, char* start, size_t len, size_t extra,
}
static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end)
static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end)
{
if (x == 0 || base == 0 || base > 16) {
if (prefix != 0) { mi_outc(prefix, out, end); }
@ -144,8 +144,8 @@ static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char*
mi_outc((digit <= 9 ? '0' + digit : 'A' + digit - 10),out,end);
x = x / base;
}
if (prefix != 0) {
mi_outc(prefix, out, end);
if (prefix != 0) {
mi_outc(prefix, out, end);
}
size_t len = *out - start;
// and reverse in-place
@ -171,7 +171,18 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
char c;
MI_NEXTC();
if (c != '%') {
if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t') { // output visible ascii or standard control only
if (c == '\\') {
MI_NEXTC();
switch (c) {
case 'e': mi_outc('\x1B', &out, end); break;
case 't': mi_outc('\t', &out, end); break;
case 'n': mi_outc('\n', &out, end); break;
case 'r': mi_outc('\r', &out, end); break;
case '\\': mi_outc('\\', &out, end); break;
default: /* ignore */ break;
}
}
else if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t' || c=='\x1b') { // output visible ascii or standard control only
mi_outc(c, &out, end);
}
}
@ -181,7 +192,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
size_t width = 0;
char numtype = 'd';
char numplus = 0;
bool alignright = true;
bool alignright = true;
if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); }
if (c == '-') { alignright = false; MI_NEXTC(); }
if (c == '0') { fill = '0'; MI_NEXTC(); }
@ -191,7 +202,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
width = (10 * width) + (c - '0'); MI_NEXTC();
}
if (c == 0) break; // extra check due to while
}
}
if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); }
else if (c == 'l') {
numtype = c; MI_NEXTC();
@ -199,7 +210,10 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
}
char* start = out;
if (c == 's') {
if (c == '%') {
mi_outc('%', &out, end);
}
else if (c == 's') {
// string
const char* s = va_arg(args, const char*);
mi_outs(s, &out, end);
@ -273,3 +287,127 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) {
_mi_vsnprintf(buf, buflen, fmt, args);
va_end(args);
}
// --------------------------------------------------------
// generic trailing and leading zero count, and popcount
// --------------------------------------------------------
#if !MI_HAS_FAST_BITSCAN
static size_t mi_ctz_generic32(uint32_t x) {
// de Bruijn multiplication, see <http://keithandkatie.com/keith/papers/debruijn.html>
static const uint8_t debruijn[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
if (x==0) return 32;
return debruijn[(uint32_t)((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27];
}
static size_t mi_clz_generic32(uint32_t x) {
// de Bruijn multiplication, see <http://keithandkatie.com/keith/papers/debruijn.html>
static const uint8_t debruijn[32] = {
31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
};
if (x==0) return 32;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27];
}
size_t _mi_ctz_generic(size_t x) {
if (x==0) return MI_SIZE_BITS;
#if (MI_SIZE_BITS <= 32)
return mi_ctz_generic32((uint32_t)x);
#else
const uint32_t lo = (uint32_t)x;
if (lo != 0) {
return mi_ctz_generic32(lo);
}
else {
return (32 + mi_ctz_generic32((uint32_t)(x>>32)));
}
#endif
}
size_t _mi_clz_generic(size_t x) {
if (x==0) return MI_SIZE_BITS;
#if (MI_SIZE_BITS <= 32)
return mi_clz_generic32((uint32_t)x);
#else
const uint32_t hi = (uint32_t)(x>>32);
if (hi != 0) {
return mi_clz_generic32(hi);
}
else {
return 32 + mi_clz_generic32((uint32_t)x);
}
#endif
}
#endif // bit scan
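The de Bruijn trick works because `x & -x` isolates the lowest set bit, say 2^k; multiplying the de Bruijn constant by 2^k is a left shift by k, and the top 5 bits of the product are distinct for every k, so they index a 32-entry table. A standalone sanity check (not part of mimalloc) that mirrors the code above and compares it against a naive loop:
#include <stdint.h>
#include <assert.h>
static unsigned ctz32_debruijn(uint32_t x) {
  static const uint8_t debruijn[32] = {
    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
  };
  if (x == 0) return 32;
  const uint32_t lowest = x & (0u - x);            // isolate the lowest set bit (2^k)
  return debruijn[(lowest * 0x077CB531u) >> 27];   // top 5 bits of the product identify k
}
static unsigned ctz32_naive(uint32_t x) {
  if (x == 0) return 32;
  unsigned n = 0;
  while ((x & 1) == 0) { x >>= 1; n++; }
  return n;
}
int main(void) {
  for (unsigned i = 0; i < 32; i++) {
    const uint32_t x = 1u << i;
    assert(ctz32_debruijn(x) == i);
    assert(ctz32_debruijn(x | 0x80000000u) == ctz32_naive(x | 0x80000000u));
  }
  return 0;
}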
#if !MI_HAS_FAST_POPCOUNT
#if MI_SIZE_SIZE == 4
#define mi_mask_even_bits32 (0x55555555)
#define mi_mask_even_pairs32 (0x33333333)
#define mi_mask_even_nibbles32 (0x0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
static size_t mi_byte_sum32(uint32_t x) {
// perform `x * 0x01010101`: the highest byte contains the sum of all bytes.
x += (x << 8);
x += (x << 16);
return (size_t)(x >> 24);
}
static size_t mi_popcount_generic32(uint32_t x) {
// first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10
// in other words, `a - (a>>1)`; to do this in parallel, we need to mask to prevent spilling a bit pair
// into the lower bit-pair:
x = x - ((x >> 1) & mi_mask_even_bits32);
// add the 2-bit pair results
x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32);
// add the 4-bit nibble results
x = (x + (x >> 4)) & mi_mask_even_nibbles32;
// each byte now has a count of its bits, we can sum them now:
return mi_byte_sum32(x);
}
size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic32(x);
}
#else
#define mi_mask_even_bits64 (0x5555555555555555)
#define mi_mask_even_pairs64 (0x3333333333333333)
#define mi_mask_even_nibbles64 (0x0F0F0F0F0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
static size_t mi_byte_sum64(uint64_t x) {
x += (x << 8);
x += (x << 16);
x += (x << 32);
return (size_t)(x >> 56);
}
static size_t mi_popcount_generic64(uint64_t x) {
x = x - ((x >> 1) & mi_mask_even_bits64);
x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64);
x = (x + (x >> 4)) & mi_mask_even_nibbles64;
return mi_byte_sum64(x);
}
size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic64(x);
}
#endif
#endif // popcount
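The popcount above is a classic SWAR ("SIMD within a register") reduction: each step widens the per-group bit counts from 2-bit pairs to nibbles to bytes, and the final step sums the four byte counts. A standalone check (not part of mimalloc) using the multiply form of the byte sum, which is equivalent to the shift-and-add `mi_byte_sum32` above:
#include <stdint.h>
#include <assert.h>
static unsigned popcount32_swar(uint32_t x) {
  x = x - ((x >> 1) & 0x55555555u);                  // per 2-bit pair: number of set bits (0..2)
  x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);  // per nibble: number of set bits (0..4)
  x = (x + (x >> 4)) & 0x0F0F0F0Fu;                  // per byte: number of set bits (0..8)
  return (x * 0x01010101u) >> 24;                    // highest byte of the product holds the total
}
static unsigned popcount32_naive(uint32_t x) {
  unsigned n = 0;
  while (x != 0) { n += (x & 1u); x >>= 1; }
  return n;
}
int main(void) {
  const uint32_t samples[] = { 0u, 1u, 0x80000000u, 0xF0F00F0Fu, 0x077CB531u, 0xFFFFFFFFu };
  for (unsigned i = 0; i < sizeof(samples)/sizeof(samples[0]); i++) {
    assert(popcount32_swar(samples[i]) == popcount32_naive(samples[i]));
  }
  return 0;
}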

View file

@ -102,6 +102,14 @@ typedef struct mi_option_desc_s {
#endif
#endif
#ifndef MI_DEFAULT_PAGEMAP_COMMIT
#if defined(__APPLE__) // when overloading malloc, we still get mixed pointers sometimes on macOS; this avoids a bad access
#define MI_DEFAULT_PAGEMAP_COMMIT 1
#else
#define MI_DEFAULT_PAGEMAP_COMMIT 0
#endif
#endif
static mi_option_desc_t options[_mi_option_last] =
{
@ -136,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] =
#else
{ 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
#endif
{ 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds
{ 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds
{ 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes.
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas)
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose
@ -145,9 +153,8 @@ static mi_option_desc_t options[_mi_option_last] =
{ 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try.
{ 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
{ MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`)
{ 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's
{ 1, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's
{ 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) },
{ 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free
{ MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's)
{ 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries.
#if defined(MI_VISIT_ABANDONED)
@ -162,6 +169,13 @@ static mi_option_desc_t options[_mi_option_last] =
UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000)
{ 0, UNINIT, MI_OPTION(guarded_sample_seed)},
{ 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable.
{ 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free
{ 2, UNINIT, MI_OPTION(page_full_retain) },
{ 4, UNINIT, MI_OPTION(page_max_candidates) },
{ 0, UNINIT, MI_OPTION(max_vabits) },
{ MI_DEFAULT_PAGEMAP_COMMIT,
UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront?
{ 2, UNINIT, MI_OPTION(page_commit_on_demand) },
};
static void mi_option_init(mi_option_desc_t* desc);
@ -416,7 +430,7 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me
// Define our own limited `fprintf` that avoids memory allocation.
// We do this using `_mi_vsnprintf` with a limited buffer.
static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
char buf[512];
char buf[992];
if (fmt==NULL) return;
if (!mi_recurse_enter()) return;
_mi_vsnprintf(buf, sizeof(buf)-1, fmt, args);
@ -442,6 +456,13 @@ static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix
}
}
void _mi_output_message(const char* fmt, ...) {
va_list args;
va_start(args, fmt);
mi_vfprintf(NULL, NULL, NULL, fmt, args);
va_end(args);
}
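The pattern behind mi_vfprintf and _mi_output_message above is worth spelling out: format into a fixed stack buffer with a bounded vsnprintf-style routine, then hand the string to an output callback, so the allocator can log without ever calling malloc. A standalone sketch (my names, plain libc vsnprintf standing in for `_mi_vsnprintf`):

#include <stdarg.h>
#include <stdio.h>

typedef void (output_fun)(const char* msg, void* arg);

static void log_fmt(output_fun* out, void* arg, const char* fmt, ...) {
  char buf[512];               // fixed stack buffer: long messages truncate, nothing is allocated
  va_list args;
  va_start(args, fmt);
  vsnprintf(buf, sizeof(buf), fmt, args);
  va_end(args);
  out(buf, arg);
}

static void to_stderr(const char* msg, void* arg) { (void)arg; fputs(msg, stderr); }

int main(void) {
  int dummy = 0;
  log_fmt(to_stderr, NULL, "reserved %zu KiB at %p\n", (size_t)1024, (void*)&dummy);
  return 0;
}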
void _mi_trace_message(const char* fmt, ...) {
if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher
va_list args;

249
src/os.c
View file

@ -9,21 +9,12 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc/atomic.h"
#include "mimalloc/prim.h"
#define mi_os_stat_increase(stat,amount) _mi_stat_increase(&_mi_stats_main.stat, amount)
#define mi_os_stat_decrease(stat,amount) _mi_stat_decrease(&_mi_stats_main.stat, amount)
#define mi_os_stat_counter_increase(stat,inc) _mi_stat_counter_increase(&_mi_stats_main.stat, inc)
// always use main stats for OS calls
#define os_stats (&_mi_stats_main)
/* -----------------------------------------------------------
Initialization.
----------------------------------------------------------- */
#ifndef MI_DEFAULT_VIRTUAL_ADDRESS_BITS
#if MI_INTPTR_SIZE < 8
#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 32
#else
#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 48
#endif
#endif
#ifndef MI_DEFAULT_PHYSICAL_MEMORY
#if MI_INTPTR_SIZE < 8
#define MI_DEFAULT_PHYSICAL_MEMORY 4*MI_GiB
@ -37,7 +28,7 @@ static mi_os_mem_config_t mi_os_mem_config = {
0, // large page size (usually 2MiB)
4096, // allocation granularity
MI_DEFAULT_PHYSICAL_MEMORY,
MI_DEFAULT_VIRTUAL_ADDRESS_BITS,
MI_MAX_VABITS, // in `bits.h`
true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems)
false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span)
true // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory)
@ -62,6 +53,18 @@ size_t _mi_os_large_page_size(void) {
return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size());
}
size_t _mi_os_guard_page_size(void) {
const size_t gsize = _mi_os_page_size();
mi_assert(gsize <= (MI_ARENA_SLICE_SIZE/8));
return gsize;
}
size_t _mi_os_virtual_address_bits(void) {
const size_t vbits = mi_os_mem_config.virtual_address_bits;
mi_assert(vbits <= MI_MAX_VABITS);
return vbits;
}
bool _mi_os_use_large_page(size_t size, size_t alignment) {
// if we have access, check the size and alignment requirements
if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false;
@ -91,73 +94,54 @@ void _mi_os_init(void) {
bool _mi_os_decommit(void* addr, size_t size);
bool _mi_os_commit(void* addr, size_t size, bool* is_zero);
static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
mi_assert_internal(alignment != 0);
uintptr_t mask = alignment - 1;
if ((alignment & mask) == 0) { // power of two?
return (sz & ~mask);
}
else {
return ((sz / alignment) * alignment);
}
}
static void* mi_align_down_ptr(void* p, size_t alignment) {
return (void*)_mi_align_down((uintptr_t)p, alignment);
}
/* -----------------------------------------------------------
aligned hinting
-------------------------------------------------------------- */
// On systems with enough virtual address bits, we can do efficient aligned allocation by using
// the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address
// space (64TiB) we use this technique. (but see issue #939)
#if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT)
static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
// Return a MI_SEGMENT_SIZE aligned address that is probably available.
// If this returns NULL, the OS will determine the address but on some OS's that may not be
// properly aligned which can be more costly as it needs to be adjusted afterwards.
// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
// in the middle of the 2TiB - 6TiB address range (see issue #372))
#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start
#define MI_HINT_AREA ((uintptr_t)4 << 40) // up to 6TiB (since before Win8 there is "only" 8TiB available to processes)
#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size)
{
if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
if (mi_os_mem_config.virtual_address_bits < 46) return NULL; // < 64TiB virtual address space
size = _mi_align_up(size, MI_SEGMENT_SIZE);
if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
#if (MI_SECURE>0)
size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VMAs but increases guarded areas.
#endif
uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize
uintptr_t init = MI_HINT_BASE;
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode
uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap());
init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB
#endif
uintptr_t expected = hint + size;
mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
}
if (hint%try_alignment != 0) return NULL;
return (void*)hint;
}
#else
void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
MI_UNUSED(try_alignment); MI_UNUSED(size);
return NULL;
}
#endif
// In secure mode, return the size of a guard page, otherwise 0
size_t _mi_os_secure_guard_page_size(void) {
#if MI_SECURE > 0
return _mi_os_guard_page_size();
#else
return 0;
#endif
}
// In secure mode, try to decommit an area and output a warning if this fails.
bool _mi_os_secure_guard_page_set_at(void* addr, bool is_pinned) {
if (addr == NULL) return true;
#if MI_SECURE > 0
const bool ok = (is_pinned ? false : _mi_os_decommit(addr, _mi_os_secure_guard_page_size()));
if (!ok) {
_mi_error_message(EINVAL, "secure level %d, but failed to commit guard page (at %p of size %zu)\n", MI_SECURE, addr, _mi_os_secure_guard_page_size());
}
return ok;
#else
MI_UNUSED(is_pinned);
return true;
#endif
}
// In secure mode, try to decommit an area and output a warning if this fails.
bool _mi_os_secure_guard_page_set_before(void* addr, bool is_pinned) {
return _mi_os_secure_guard_page_set_at((uint8_t*)addr - _mi_os_secure_guard_page_size(), is_pinned);
}
// In secure mode, try to recommit an area
bool _mi_os_secure_guard_page_reset_at(void* addr) {
if (addr == NULL) return true;
#if MI_SECURE > 0
return _mi_os_commit(addr, _mi_os_secure_guard_page_size(), NULL);
#else
return true;
#endif
}
// In secure mode, try to recommit an area
bool _mi_os_secure_guard_page_reset_before(void* addr) {
return _mi_os_secure_guard_page_reset_at((uint8_t*)addr - _mi_os_secure_guard_page_size());
}
/* -----------------------------------------------------------
Free memory
@ -186,10 +170,10 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me
void* base = addr;
// different base? (due to alignment)
if (memid.mem.os.base != base) {
mi_assert(memid.mem.os.base <= addr);
base = memid.mem.os.base;
const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base;
if (memid.mem.os.size==0) {
csize += diff;
}
if (still_committed) {
@ -236,8 +220,6 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm
_mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large);
}
mi_os_stat_counter_increase(mmap_calls, 1);
if (p != NULL) {
mi_os_stat_increase(reserved, size);
@ -270,18 +252,24 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
size = _mi_align_up(size, _mi_os_page_size());
// try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD)
void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
if (p == NULL) return NULL;
// try a direct allocation if the alignment is at most the allocation granularity, or larger than 1/8th of the size.
const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8);
void* p = NULL;
if (try_direct_alloc) {
p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
}
// aligned already?
if (((uintptr_t)p % alignment) == 0) {
if (p != NULL && ((uintptr_t)p % alignment) == 0) {
*base = p;
}
else {
// if not aligned, free it, overallocate, and unmap around it
#if !MI_TRACK_ASAN
_mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
if (try_direct_alloc) {
_mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
}
#endif
if (p != NULL) { mi_os_prim_free(p, size, (commit ? size : 0)); }
if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
@ -293,10 +281,10 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
if (p == NULL) return NULL;
// set p to the aligned part in the full region
// note: this is dangerous on Windows as VirtualFree needs the actual base pointer
// this is handled though by having the `base` field in the memid's
// note: on Windows VirtualFree needs the actual base pointer
// this is handled by having the `base` field in the memid.
*base = p; // remember the base
p = mi_align_up_ptr(p, alignment);
p = _mi_align_up_ptr(p, alignment);
// explicitly commit only the aligned part
if (commit) {
@ -309,7 +297,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
if (p == NULL) return NULL;
// and selectively unmap parts around the over-allocated area.
void* aligned_p = mi_align_up_ptr(p, alignment);
void* aligned_p = _mi_align_up_ptr(p, alignment);
size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
size_t mid_size = _mi_align_up(size, _mi_os_page_size());
size_t post_size = over_size - pre_size - mid_size;
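The fallback path above reserves `size + alignment`, keeps the aligned block inside it, and releases the slack on both sides. Hypothetical numbers (mine, not from the diff) make the split concrete:

#include <stdint.h>
#include <stdio.h>

static uintptr_t align_up(uintptr_t n, uintptr_t a) { return (n + a - 1) & ~(a - 1); }

int main(void) {
  const uintptr_t size = 4u << 20, alignment = 1u << 20;  // ask for 4 MiB at 1 MiB alignment
  const uintptr_t over_size = size + alignment;           // reserve 5 MiB
  const uintptr_t p = 0x12cc0000u;                        // pretend (unaligned) OS result
  const uintptr_t aligned_p = align_up(p, alignment);     // 0x12d00000
  const uintptr_t pre  = aligned_p - p;                   // released before the block
  const uintptr_t post = over_size - pre - size;          // released after the block
  printf("pre = %u KiB, kept = %u KiB, post = %u KiB\n",
         (unsigned)(pre >> 10), (unsigned)(size >> 10), (unsigned)(post >> 10));
  return 0;
}

In the real code the middle size is additionally page-aligned, and on systems that cannot partially free a mapping (Windows) the whole reservation is kept and only the aligned middle is committed, which is why the memid remembers the original base.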
@ -339,7 +327,7 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid) {
bool os_is_zero = false;
void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero);
if (p != NULL) {
*memid = _mi_memid_create_os(true, os_is_zero, os_is_large);
*memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large);
}
return p;
}
@ -355,9 +343,9 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
bool os_is_large = false;
bool os_is_zero = false;
void* os_base = NULL;
void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base );
void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base);
if (p != NULL) {
*memid = _mi_memid_create_os(commit, os_is_zero, os_is_large);
*memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large);
memid->mem.os.base = os_base;
// memid->mem.os.alignment = alignment;
memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned
@ -365,6 +353,18 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
return p;
}
void* _mi_os_zalloc(size_t size, mi_memid_t* memid) {
void* p = _mi_os_alloc(size, memid);
if (p == NULL) return NULL;
// zero the OS memory if needed
if (!memid->initially_zero) {
_mi_memzero_aligned(p, size);
memid->initially_zero = true;
}
return p;
}
/* -----------------------------------------------------------
OS aligned allocation with an offset. This is used
for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc
@ -374,11 +374,9 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
----------------------------------------------------------- */
void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) {
mi_assert(offset <= MI_SEGMENT_SIZE);
mi_assert(offset <= size);
mi_assert((alignment % _mi_os_page_size()) == 0);
*memid = _mi_memid_none();
if (offset > MI_SEGMENT_SIZE) return NULL;
if (offset == 0) {
// regular aligned allocation
return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid);
@ -411,11 +409,11 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size,
if (newsize != NULL) *newsize = 0;
if (size == 0 || addr == NULL) return NULL;
// page align conservatively within the range
void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size())
// page align the range: conservatively (staying inside it), or liberally (straddling the enclosing pages)
void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size())
: mi_align_down_ptr(addr, _mi_os_page_size()));
void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
: mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
: _mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
if (diff <= 0) return NULL;
@ -526,7 +524,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size)
return needs_recommit;
}
else {
if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed
if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory)
_mi_os_reset(p, size);
}
return false; // needs no recommit
@ -591,15 +589,14 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
start = huge_start;
if (start == 0) {
// Initialize the start address after the 32TiB area
start = ((uintptr_t)32 << 40); // 32TiB virtual start address
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode
start = ((uintptr_t)8 << 40); // 8TiB virtual start address
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode
uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap());
start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB
#endif
}
end = start + size;
mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
} while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
} while (!mi_atomic_cas_weak_acq_rel(&mi_huge_start, &huge_start, end));
if (total_size != NULL) *total_size = size;
return (uint8_t*)start;
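The claim loop above hands out consecutive, non-overlapping huge-page address ranges by advancing a shared cursor with a compare-and-swap. A minimal C11 model (my names; assumes a 64-bit uintptr_t) of that pattern:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic(uintptr_t) cursor;   // like mi_huge_start: 0 until first use

static uintptr_t claim(uintptr_t size, uintptr_t init) {
  uintptr_t expected = atomic_load(&cursor);
  uintptr_t start, end;
  do {
    start = (expected == 0 ? init : expected);  // the first claimant initializes the start
    end = start + size;
  } while (!atomic_compare_exchange_weak(&cursor, &expected, end));
  return start;   // [start, start+size) now belongs to this claimant
}

int main(void) {
  const uintptr_t GiB  = (uintptr_t)1 << 30;
  const uintptr_t base = (uintptr_t)8 << 40;            // pretend 8 TiB start, as above
  uintptr_t a = claim(4*GiB, base);
  uintptr_t b = claim(2*GiB, base);
  printf("a=0x%llx b=0x%llx (disjoint)\n", (unsigned long long)a, (unsigned long long)b);
  return 0;
}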
@ -612,7 +609,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
}
#endif
// Allocate MI_SEGMENT_SIZE aligned huge pages
// Allocate MI_ARENA_SLICE_ALIGN aligned huge pages
void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) {
*memid = _mi_memid_none();
if (psize != NULL) *psize = 0;
@ -674,7 +671,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
if (page != 0) {
mi_assert(start != NULL);
*memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */);
*memid = _mi_memid_create_os(start, *psize, true /* is committed */, all_zero, true /* is_large */);
memid->memkind = MI_MEM_OS_HUGE;
mi_assert(memid->is_pinned);
#ifdef MI_TRACK_ASAN
@ -727,3 +724,49 @@ int _mi_os_numa_node_get(void) {
if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
return (int)numa_node;
}
/* ----------------------------------------------------------------------------
Public API
-----------------------------------------------------------------------------*/
#if 0
mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size) {
return mi_os_alloc_aligned(size, mi_os_mem_config.alloc_granularity, commit, NULL, full_size);
}
static void* mi_os_alloc_aligned_ex(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) {
mi_memid_t memid = _mi_memid_none();
void* p = _mi_os_alloc_aligned(size, alignment, commit, allow_large, &memid);
if (p == NULL) return p;
if (is_committed != NULL) { *is_committed = memid.initially_committed; }
if (is_pinned != NULL) { *is_pinned = memid.is_pinned; }
if (base != NULL) { *base = memid.mem.os.base; }
if (full_size != NULL) { *full_size = memid.mem.os.size; }
if (!memid.initially_zero && memid.initially_committed) {
_mi_memzero_aligned(memid.mem.os.base, memid.mem.os.size);
}
return p;
}
mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size) {
return mi_os_alloc_aligned_ex(size, alignment, commit, false, NULL, NULL, base, full_size);
}
mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) {
return mi_os_alloc_aligned_ex(size, alignment, commit, true, is_committed, is_pinned, base, full_size);
}
mi_decl_export void mi_os_free(void* p, size_t size) {
if (p==NULL || size == 0) return;
mi_memid_t memid = _mi_memid_create_os(p, size, true, false, false);
_mi_os_free(p, size, memid);
}
mi_decl_export void mi_os_commit(void* p, size_t size) {
_mi_os_commit(p, size, NULL);
}
mi_decl_export void mi_os_decommit(void* p, size_t size) {
_mi_os_decommit(p, size);
}
#endif

329
src/page-map.c Normal file
View file

@ -0,0 +1,329 @@
/*----------------------------------------------------------------------------
Copyright (c) 2023-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"
#if MI_PAGE_MAP_FLAT
// The page-map contains a byte for each 64kb slice in the address space.
// For an address `a` where `ofs = _mi_page_map[a >> 16]`:
// 0 = unused
// 1 = the slice at `a & ~0xFFFF` is the start of a mimalloc page.
// 1 < ofs <= 127 = the slice is part of a page, starting at `(((a>>16) - ofs + 1) << 16)`.
//
// 1 byte per slice => 1 TiB of address space needs a 2^(40-16) = 16 MiB page map.
// A full 256 TiB address space (48 bit) needs a 4 GiB page map.
// A full 4 GiB address space (32 bit) needs only a 64 KiB page map.
mi_decl_cache_align uint8_t* _mi_page_map = NULL;
static void* mi_page_map_max_address = NULL;
static mi_memid_t mi_page_map_memid;
#define MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT MI_ARENA_SLICE_SIZE
static mi_bitmap_t* mi_page_map_commit; // one commit bit per MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT page-map entries (64 KiB of the map)
static void mi_page_map_ensure_committed(size_t idx, size_t slice_count);
bool _mi_page_map_init(void) {
size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS);
if (vbits == 0) {
vbits = _mi_os_virtual_address_bits();
#if MI_ARCH_X64 // canonical address is limited to the first 128 TiB
if (vbits >= 48) { vbits = 47; }
#endif
}
// Allocate the page map and commit bits
mi_page_map_max_address = (void*)(MI_PU(1) << vbits);
const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT));
const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_pagemap_commit)); // _mi_os_has_overcommit(); // commit on-access on Linux systems?
const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT);
const size_t bitmap_size = (commit ? 0 : mi_bitmap_size(commit_bits, NULL));
const size_t reserve_size = bitmap_size + page_map_size;
uint8_t* const base = (uint8_t*)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid);
if (base==NULL) {
_mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB);
return false;
}
if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) {
_mi_warning_message("internal: the page map was committed but not zero initialized!\n");
_mi_memzero_aligned(base, reserve_size);
}
if (bitmap_size > 0) {
mi_page_map_commit = (mi_bitmap_t*)base;
_mi_os_commit(mi_page_map_commit, bitmap_size, NULL);
mi_bitmap_init(mi_page_map_commit, commit_bits, true);
}
_mi_page_map = base + bitmap_size;
// commit the first part so NULL pointers get resolved without an access violation
if (!commit) {
mi_page_map_ensure_committed(0, 1);
}
_mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL
mi_assert_internal(_mi_ptr_page(NULL)==NULL);
return true;
}
static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) {
// is the page map area that contains the page address committed?
// we always set the commit bits so we can track what ranges are in-use.
// we only actually commit if the map wasn't committed fully already.
if (mi_page_map_commit != NULL) {
const size_t commit_idx = idx / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
const size_t commit_idx_hi = (idx + slice_count - 1) / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
for (size_t i = commit_idx; i <= commit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks
if (mi_bitmap_is_clear(mi_page_map_commit, i)) {
// this may race, in which case we do multiple commits (which is ok)
bool is_zero;
uint8_t* const start = _mi_page_map + (i * MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT);
const size_t size = MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
_mi_os_commit(start, size, &is_zero);
if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); }
mi_bitmap_set(mi_page_map_commit, i);
}
}
}
#if MI_DEBUG > 0
_mi_page_map[idx] = 0;
_mi_page_map[idx+slice_count-1] = 0;
#endif
}
static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) {
size_t page_size;
*page_start = mi_page_area(page, &page_size);
if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer
*slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks
return _mi_page_map_index(page);
}
void _mi_page_map_register(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access!
if mi_unlikely(_mi_page_map == NULL) {
if (!_mi_page_map_init()) return;
}
mi_assert(_mi_page_map!=NULL);
uint8_t* page_start;
size_t slice_count;
const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count);
mi_page_map_ensure_committed(idx, slice_count);
// set the offsets
for (size_t i = 0; i < slice_count; i++) {
mi_assert_internal(i < 128);
_mi_page_map[idx + i] = (uint8_t)(i+1);
}
}
void _mi_page_map_unregister(mi_page_t* page) {
mi_assert_internal(_mi_page_map != NULL);
// get index and count
uint8_t* page_start;
size_t slice_count;
const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count);
// unset the offsets
_mi_memzero(_mi_page_map + idx, slice_count);
}
void _mi_page_map_unregister_range(void* start, size_t size) {
const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE);
const uintptr_t index = _mi_page_map_index(start);
mi_page_map_ensure_committed(index, slice_count); // we commit the range in total; todo: scan the commit bits and clear only those ranges?
_mi_memzero(&_mi_page_map[index], slice_count);
}
mi_page_t* _mi_safe_ptr_page(const void* p) {
if mi_unlikely(p >= mi_page_map_max_address) return NULL;
const uintptr_t idx = _mi_page_map_index(p);
if mi_unlikely(mi_page_map_commit != NULL && !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL;
const uintptr_t ofs = _mi_page_map[idx];
if mi_unlikely(ofs == 0) return NULL;
return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) - ofs + 1) << MI_ARENA_SLICE_SHIFT);
}
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return (_mi_safe_ptr_page(p) != NULL);
}
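The encoding documented at the top of this file can be exercised in isolation: registering a page writes 1,2,3,... into the slices it covers, and any interior pointer recovers the page start by stepping back `ofs - 1` slices. A tiny standalone model (my names, toy-sized map):

#include <stdint.h>
#include <stdio.h>

#define SLICE_SHIFT 16   // 64 KiB slices, as in the flat map above

int main(void) {
  uint8_t map[16] = {0};                              // toy page map covering 16 slices
  const size_t page_slice = 4, slice_count = 3;       // a page covering slices 4,5,6
  for (size_t i = 0; i < slice_count; i++) {
    map[page_slice + i] = (uint8_t)(i + 1);           // as in _mi_page_map_register
  }
  const uintptr_t addr = ((uintptr_t)6 << SLICE_SHIFT) + 0x1234;   // interior pointer in slice 6
  const size_t idx = (size_t)(addr >> SLICE_SHIFT);
  const uint8_t ofs = map[idx];
  const uintptr_t page_start = (uintptr_t)(idx - ofs + 1) << SLICE_SHIFT;  // back to slice 4
  printf("slice %zu, ofs %u -> page starts at 0x%lx\n",
         idx, (unsigned)ofs, (unsigned long)page_start);
  return 0;
}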
#else
// A 2-level page map
#define MI_PAGE_MAP_SUB_SIZE (MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*))
mi_decl_cache_align mi_page_t*** _mi_page_map;
static void* mi_page_map_max_address;
static mi_memid_t mi_page_map_memid;
static _Atomic(mi_bfield_t) mi_page_map_commit;
static mi_page_t** mi_page_map_ensure_committed(size_t idx);
static mi_page_t** mi_page_map_ensure_at(size_t idx);
static inline void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count);
bool _mi_page_map_init(void) {
size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS);
if (vbits == 0) {
vbits = _mi_os_virtual_address_bits();
#if MI_ARCH_X64 // canonical address is limited to the first 128 TiB
if (vbits >= 48) { vbits = 47; }
#endif
}
// Allocate the page map and commit bits
mi_assert(MI_MAX_VABITS >= vbits);
mi_page_map_max_address = (void*)(MI_PU(1) << vbits);
const size_t page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT));
mi_assert(page_map_count <= MI_PAGE_MAP_COUNT);
const size_t os_page_size = _mi_os_page_size();
const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size);
const size_t reserve_size = page_map_size + os_page_size;
const bool commit = page_map_size <= 64*MI_KiB ||
mi_option_is_enabled(mi_option_pagemap_commit) || _mi_os_has_overcommit();
_mi_page_map = (mi_page_t***)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid);
if (_mi_page_map==NULL) {
_mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB);
return false;
}
if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) {
_mi_warning_message("internal: the page map was committed but not zero initialized!\n");
_mi_memzero_aligned(_mi_page_map, page_map_size);
}
mi_atomic_store_release(&mi_page_map_commit, (commit ? ~MI_ZU(0) : MI_ZU(0)));
// note: for the NULL range we only commit one OS page (in the map and sub)
if (!mi_page_map_memid.initially_committed) {
_mi_os_commit(&_mi_page_map[0], os_page_size, NULL); // commit first part of the map
}
_mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved 2 subs at the end already
if (!mi_page_map_memid.initially_committed) {
_mi_os_commit(_mi_page_map[0], os_page_size, NULL); // only first OS page
}
_mi_page_map[0][0] = (mi_page_t*)&_mi_page_empty; // caught in `mi_free`
mi_assert_internal(_mi_ptr_page(NULL)==&_mi_page_empty);
return true;
}
#define MI_PAGE_MAP_ENTRIES_PER_CBIT (MI_PAGE_MAP_COUNT / MI_BFIELD_BITS)
static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) {
mi_bfield_t commit = mi_atomic_load_relaxed(&mi_page_map_commit);
const size_t bit_idx = idx/MI_PAGE_MAP_ENTRIES_PER_CBIT;
mi_assert_internal(bit_idx < MI_BFIELD_BITS);
if (pbit_idx != NULL) { *pbit_idx = bit_idx; }
return ((commit & (MI_ZU(1) << bit_idx)) != 0);
}
static mi_page_t** mi_page_map_ensure_committed(size_t idx) {
size_t bit_idx;
if mi_unlikely(!mi_page_map_is_committed(idx, &bit_idx)) {
uint8_t* start = (uint8_t*)&_mi_page_map[bit_idx * MI_PAGE_MAP_ENTRIES_PER_CBIT];
_mi_os_commit(start, MI_PAGE_MAP_ENTRIES_PER_CBIT * sizeof(mi_page_t**), NULL);
mi_atomic_or_acq_rel(&mi_page_map_commit, MI_ZU(1) << bit_idx);
}
return _mi_page_map[idx];
}
static mi_page_t** mi_page_map_ensure_at(size_t idx) {
mi_page_t** sub = mi_page_map_ensure_committed(idx);
if mi_unlikely(sub == NULL) {
// sub map not yet allocated, alloc now
mi_memid_t memid;
sub = (mi_page_t**)_mi_os_alloc(MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), &memid);
mi_page_t** expect = NULL;
if (!mi_atomic_cas_strong_acq_rel(((_Atomic(mi_page_t**)*)&_mi_page_map[idx]), &expect, sub)) {
// another thread already allocated it.. free and continue
_mi_os_free(sub, MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), memid);
sub = expect;
mi_assert_internal(sub!=NULL);
}
if (sub == NULL) {
_mi_error_message(EFAULT, "internal error: unable to extend the page map\n");
}
}
return sub;
}
static void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count) {
// is the page map area that contains the page address committed?
while (slice_count > 0) {
mi_page_t** sub = mi_page_map_ensure_at(idx);
// set the offsets for the page
while (sub_idx < MI_PAGE_MAP_SUB_COUNT) {
sub[sub_idx] = page;
slice_count--; if (slice_count == 0) return;
sub_idx++;
}
idx++; // potentially wrap around to the next idx
sub_idx = 0;
}
}
static size_t mi_page_map_get_idx(mi_page_t* page, size_t* sub_idx, size_t* slice_count) {
size_t page_size;
uint8_t* page_start = mi_page_area(page, &page_size);
if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer
*slice_count = mi_slice_count_of_size(page_size) + ((page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks
return _mi_page_map_index(page, sub_idx);
}
void _mi_page_map_register(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access!
if mi_unlikely(_mi_page_map == NULL) {
if (!_mi_page_map_init()) return;
}
mi_assert(_mi_page_map!=NULL);
size_t slice_count;
size_t sub_idx;
const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count);
mi_page_map_set_range(page, idx, sub_idx, slice_count);
}
void _mi_page_map_unregister(mi_page_t* page) {
mi_assert_internal(_mi_page_map != NULL);
// get index and count
size_t slice_count;
size_t sub_idx;
const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count);
// unset the offsets
mi_page_map_set_range(NULL, idx, sub_idx, slice_count);
}
void _mi_page_map_unregister_range(void* start, size_t size) {
const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE);
size_t sub_idx;
const uintptr_t idx = _mi_page_map_index(start, &sub_idx);
mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not already committed?
}
mi_page_t* _mi_safe_ptr_page(const void* p) {
if mi_unlikely(p >= mi_page_map_max_address) return NULL;
size_t sub_idx;
const size_t idx = _mi_page_map_index(p,&sub_idx);
if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL;
mi_page_t** const sub = _mi_page_map[idx];
if mi_unlikely(sub==NULL) return NULL;
return sub[sub_idx];
}
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return (_mi_safe_ptr_page(p) != NULL);
}
#endif
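The two-level variant replaces the flat byte array with a root table of sub-maps, so only sub-maps for address ranges actually in use need backing memory. A rough standalone model (my names; the real MI_PAGE_MAP_SUB_SHIFT/COUNT constants live in headers not shown in this diff):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SLICE_SHIFT 16                  // 64 KiB slices (assumed)
#define SUB_SHIFT   10                  // 1024 entries per sub-map (assumed, for illustration)
#define SUB_COUNT   ((size_t)1 << SUB_SHIFT)

typedef struct toy_page_s { int id; } toy_page_t;

static toy_page_t** root[1u << 14];     // toy root table

static toy_page_t* lookup(uintptr_t addr) {
  const size_t slice   = (size_t)(addr >> SLICE_SHIFT);
  const size_t idx     = slice >> SUB_SHIFT;        // index into the root table
  const size_t sub_idx = slice & (SUB_COUNT - 1);   // index inside the sub-map
  toy_page_t** sub = root[idx];
  return (sub == NULL ? NULL : sub[sub_idx]);       // NULL sub => not a known page
}

int main(void) {
  static toy_page_t the_page = { 42 };
  toy_page_t** sub = calloc(SUB_COUNT, sizeof(toy_page_t*));
  if (sub == NULL) return 1;
  root[3] = sub;
  sub[7] = &the_page;                               // register one slice
  const uintptr_t addr = ((uintptr_t)((3u << SUB_SHIFT) + 7) << SLICE_SHIFT) + 0x80;
  printf("hit: %d, miss: %d\n", lookup(addr) == &the_page, lookup(0x1234) == NULL);
  return 0;
}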

View file

@ -12,7 +12,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MI_IN_PAGE_C
#error "this file should be included from 'page.c'"
// include to help an IDE
#include "mimalloc.h"
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"
#endif
@ -38,15 +38,15 @@ terms of the MIT license. A copy of the license can be found in the file
static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+sizeof(uintptr_t)));
return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+sizeof(uintptr_t)));
}
static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+(2*sizeof(uintptr_t))));
return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+(2*sizeof(uintptr_t))));
}
static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
return (pq->block_size > MI_LARGE_OBJ_SIZE_MAX);
return (pq->block_size > MI_LARGE_MAX_OBJ_SIZE);
}
/* -----------------------------------------------------------
@ -76,7 +76,7 @@ static inline uint8_t mi_bin(size_t size) {
bin = (uint8_t)wsize;
}
#endif
else if (wsize > MI_LARGE_OBJ_WSIZE_MAX) {
else if (wsize > MI_LARGE_MAX_OBJ_WSIZE) {
bin = MI_BIN_HUGE;
}
else {
@ -84,8 +84,9 @@ static inline uint8_t mi_bin(size_t size) {
if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes
#endif
wsize--;
// find the highest bit
uint8_t b = (uint8_t)mi_bsr(wsize); // note: wsize != 0
mi_assert_internal(wsize!=0);
// find the highest bit position
uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize));
// and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
// - adjust with 3 because we do not round the first 8 sizes
// which each get an exact bin
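The bin computation takes the position of the highest bit plus the next two bits, so every power-of-two size range splits into four bins and the slack inside a bin stays roughly an eighth of the request. A standalone sketch (my names; only the wsize > 8 path, without the padding and word-alignment adjustments above):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// size classes for wsize > 8 words: 4 bins per power-of-two range
static uint8_t toy_bin(size_t wsize) {
  wsize--;
  uint8_t b = (uint8_t)(sizeof(size_t)*8 - 1);
  while (((wsize >> b) & 1) == 0) b--;              // highest set bit position
  return (uint8_t)((b << 2) + ((wsize >> (b - 2)) & 0x03) - 3);
}

int main(void) {
  const size_t wsizes[] = { 9, 12, 13, 16, 17, 24, 32, 100 };
  for (size_t i = 0; i < sizeof(wsizes)/sizeof(wsizes[0]); i++) {
    printf("wsize %3zu words -> bin %u\n", wsizes[i], toy_bin(wsizes[i]));
  }
  return 0;
}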
@ -111,8 +112,8 @@ size_t _mi_bin_size(uint8_t bin) {
}
// Good size for allocation
size_t mi_good_size(size_t size) mi_attr_noexcept {
if (size <= MI_LARGE_OBJ_SIZE_MAX) {
mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept {
if (size <= MI_LARGE_MAX_OBJ_SIZE) {
return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE));
}
else {
@ -210,8 +211,8 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(mi_page_queue_contains(queue, page));
mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_heap_t* heap = mi_page_heap(page);
if (page->prev != NULL) page->prev->next = page->next;
@ -226,7 +227,6 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
heap->page_count--;
page->next = NULL;
page->prev = NULL;
// mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
mi_page_set_in_full(page,false);
}
@ -242,7 +242,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
// mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
page->next = queue->first;
page->prev = NULL;
if (queue->first != NULL) {
@ -259,6 +259,34 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
heap->page_count++;
}
static void mi_page_queue_push_at_end(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(!mi_page_queue_contains(queue, page));
mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
page->prev = queue->last;
page->next = NULL;
if (queue->last != NULL) {
mi_assert_internal(queue->last->next == NULL);
queue->last->next = page;
queue->last = page;
}
else {
queue->first = queue->last = page;
}
// update direct
if (queue->first == page) {
mi_heap_queue_first_update(heap, queue);
}
heap->page_count++;
}
static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(mi_page_queue_contains(queue, page));
@ -317,8 +345,8 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t*
page->prev = to->first;
page->next = next;
to->first->next = page;
if (next != NULL) {
next->prev = page;
}
else {
to->last = page;
@ -356,13 +384,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
// set append pages to new heap and count
size_t count = 0;
for (mi_page_t* page = append->first; page != NULL; page = page->next) {
// inline `mi_page_set_heap` to avoid wrong assertion during absorption;
// in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
// set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
// side effect that it spins until any DELAYED_FREEING is finished. This ensures
// that after appending only the new heap will be used for delayed free operations.
_mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
mi_page_set_heap(page, heap);
count++;
}

View file

@ -36,14 +36,15 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta
return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
}
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
//static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page);
#if (MI_DEBUG>=3)
static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
mi_assert_internal(_mi_ptr_page(page) == page);
size_t count = 0;
while (head != NULL) {
mi_assert_internal(page == _mi_ptr_page(head));
mi_assert_internal((uint8_t*)head - (uint8_t*)page > (ptrdiff_t)MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head));
count++;
head = mi_block_next(page, head);
}
@ -59,7 +60,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) {
static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
size_t psize;
uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize);
uint8_t* page_area = mi_page_area(page, &psize);
mi_block_t* start = (mi_block_t*)page_area;
mi_block_t* end = (mi_block_t*)(page_area + psize);
while(p != NULL) {
@ -83,10 +84,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
mi_assert_internal(page->capacity <= page->reserved);
// const size_t bsize = mi_page_block_size(page);
mi_segment_t* segment = _mi_page_segment(page);
uint8_t* start = mi_page_start(page);
mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE));
// uint8_t* start = mi_page_start(page);
//mi_assert_internal(start + page->capacity*page->block_size == page->top);
mi_assert_internal(mi_page_list_is_valid(page,page->free));
@ -121,64 +119,25 @@ bool _mi_page_is_valid(mi_page_t* page) {
#if MI_SECURE
mi_assert_internal(page->keys[0] != 0);
#endif
if (mi_page_heap(page)!=NULL) {
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == mi_page_heap(page)->thread_id || segment->thread_id==0);
#if MI_HUGE_PAGE_ABANDON
if (segment->page_kind != MI_PAGE_HUGE)
#endif
if (!mi_page_is_abandoned(page)) {
//mi_assert_internal(!_mi_process_is_initialized);
{
mi_page_queue_t* pq = mi_page_queue_of(page);
mi_assert_internal(mi_page_queue_contains(pq, page));
mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page));
mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page));
// mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
}
}
return true;
}
#endif
void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
while (!_mi_page_try_use_delayed_free(page, delay, override_never)) {
mi_atomic_yield();
}
}
bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
mi_thread_free_t tfreex;
mi_delayed_t old_delay;
mi_thread_free_t tfree;
size_t yield_count = 0;
do {
tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
tfreex = mi_tf_set_delayed(tfree, delay);
old_delay = mi_tf_delayed(tfree);
if mi_unlikely(old_delay == MI_DELAYED_FREEING) {
if (yield_count >= 4) return false; // give up after 4 tries
yield_count++;
mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
// tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
}
else if (delay == old_delay) {
break; // avoid atomic operation if already equal
}
else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) {
break; // leave never-delayed flag set
}
} while ((old_delay == MI_DELAYED_FREEING) ||
!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
return true; // success
}
/* -----------------------------------------------------------
Page collect the `local_free` and `thread_free` lists
----------------------------------------------------------- */
// Collect the local `thread_free` list using an atomic exchange.
// Note: The exchange must be done atomically as this is used right after
// moving to the full list in `mi_page_collect_ex` and we need to
// ensure that there was no race where the page became unfull just before the move.
static void _mi_page_thread_free_collect(mi_page_t* page)
{
mi_block_t* head;
@ -186,21 +145,21 @@ static void _mi_page_thread_free_collect(mi_page_t* page)
mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
head = mi_tf_block(tfree);
tfreex = mi_tf_set_block(tfree,NULL);
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));
// return if the list is empty
if (head == NULL) return;
if (head == NULL) return; // return if the list is empty
tfreex = mi_tf_create(NULL,mi_tf_is_owned(tfree)); // set the thread free list to NULL
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough?
mi_assert_internal(head != NULL);
// find the tail -- also to get a proper count (without data races)
size_t max_count = page->capacity; // cannot collect more than capacity
size_t count = 1;
mi_block_t* tail = head;
mi_block_t* next;
while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) {
while( (next = mi_block_next(page,tail)) != NULL && count <= max_count) {
count++;
tail = next;
}
// if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
if (count > max_count) {
_mi_error_message(EFAULT, "corrupted thread-free list\n");
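The collection step above relies on one atomic idea: detach the whole cross-thread free list in a single atomic operation, after which the owning thread can walk it without races (the count-vs-capacity check then only guards against corrupted links). A minimal C11 sketch (my names; the real code keeps an ownership flag in the same word, which this omits):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

typedef struct node_s { struct node_s* next; } node_t;

static _Atomic(node_t*) thread_free;   // like page->xthread_free (minus the flag bits)

static void remote_free(node_t* n) {   // a non-owning thread pushes a freed block
  node_t* old = atomic_load(&thread_free);
  do { n->next = old; }
  while (!atomic_compare_exchange_weak(&thread_free, &old, n));
}

static size_t owner_collect(void) {    // the owning thread takes the whole list at once
  node_t* head = atomic_exchange(&thread_free, NULL);
  size_t count = 0;
  for (node_t* n = head; n != NULL; n = n->next) count++;   // private walk, no races
  return count;
}

int main(void) {
  node_t a, b, c;
  remote_free(&a); remote_free(&b); remote_free(&c);
  printf("collected %zu blocks\n", owner_collect());   // prints 3
  return 0;
}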
@ -219,9 +178,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
mi_assert_internal(page!=NULL);
// collect the thread free list
if (force || mi_page_thread_free(page) != NULL) { // quick test to avoid an atomic operation
_mi_page_thread_free_collect(page);
}
_mi_page_thread_free_collect(page);
// and the local free list
if (page->local_free != NULL) {
@ -254,43 +211,83 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
Page fresh and retire
----------------------------------------------------------- */
/*
// called from segments when reclaiming abandoned pages
void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
// mi_page_set_heap(page, heap);
// _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set)
_mi_page_free_collect(page, false); // ensure used count is up to date
mi_assert_expensive(mi_page_is_valid_init(page));
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
#endif
// mi_assert_internal(mi_page_heap(page) == heap);
// mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
// TODO: push on full queue immediately if it is full?
mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
mi_page_queue_push(heap, pq, page);
mi_assert_expensive(_mi_page_is_valid(page));
}
*/
// called from `mi_free` on a reclaim, and fresh_alloc if we get an abandoned page
void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page)
{
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_ptr_page(page)==page);
mi_assert_internal(mi_page_is_owned(page));
mi_assert_internal(mi_page_is_abandoned(page));
mi_page_set_heap(page,heap);
_mi_page_free_collect(page, false); // ensure used count is up to date
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
mi_page_queue_push_at_end(heap, pq, page);
mi_assert_expensive(_mi_page_is_valid(page));
}
void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
_mi_page_free_collect(page, false); // ensure used count is up to date
if (mi_page_all_free(page)) {
_mi_page_free(page, pq);
}
else {
mi_page_queue_remove(pq, page);
mi_tld_t* tld = page->heap->tld;
mi_page_set_heap(page, NULL);
_mi_arenas_page_abandon(page);
_mi_arenas_collect(false, false, tld); // allow purging
}
}
// allocate a fresh page from a segment
static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) {
#if !MI_HUGE_PAGE_ABANDON
mi_assert_internal(pq != NULL);
mi_assert_internal(mi_heap_contains_queue(heap, pq));
mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_OBJ_SIZE_MAX || block_size == pq->block_size);
#endif
mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments);
mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size);
#endif
mi_page_t* page = _mi_arenas_page_alloc(heap, block_size, page_alignment);
if (page == NULL) {
// this may be out-of-memory, or an abandoned page was reclaimed (and in our queue)
// out-of-memory
return NULL;
}
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
#endif
mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
// a fresh page was found, initialize it
const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc
mi_assert_internal(full_block_size >= block_size);
mi_page_init(heap, page, full_block_size, heap->tld);
if (mi_page_is_abandoned(page)) {
_mi_heap_page_reclaim(heap, page);
if (!mi_page_immediate_available(page)) {
if (mi_page_is_expandable(page)) {
mi_page_extend_free(heap, page);
}
else {
mi_assert(false); // should not happen?
return NULL;
}
}
}
else if (pq != NULL) {
mi_page_queue_push(heap, pq, page);
}
mi_heap_stat_increase(heap, pages, 1);
if (pq != NULL) { mi_page_queue_push(heap, pq, page); }
mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
mi_assert_expensive(_mi_page_is_valid(page));
return page;
}
@ -301,55 +298,21 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0);
if (page==NULL) return NULL;
mi_assert_internal(pq->block_size==mi_page_block_size(page));
mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page)));
mi_assert_internal(pq==mi_heap_page_queue_of(heap, page));
return page;
}
/* -----------------------------------------------------------
Do any delayed frees
(put there by other threads if they deallocated in a full page)
----------------------------------------------------------- */
void _mi_heap_delayed_free_all(mi_heap_t* heap) {
while (!_mi_heap_delayed_free_partial(heap)) {
mi_atomic_yield();
}
}
// returns true if all delayed frees were processed
bool _mi_heap_delayed_free_partial(mi_heap_t* heap) {
// take over the list (note: no atomic exchange since it is often NULL)
mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
bool all_freed = true;
// and free them all
while(block != NULL) {
mi_block_t* next = mi_block_nextx(heap,block, heap->keys);
// use internal free instead of regular one to keep stats etc correct
if (!_mi_free_delayed_block(block)) {
// we might already start delayed freeing while another thread has not yet
// reset the delayed_freeing flag; in that case delay it further by reinserting the current block
// into the delayed free list
all_freed = false;
mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
do {
mi_block_set_nextx(heap, block, dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
}
block = next;
}
return all_freed;
}
/* -----------------------------------------------------------
Unfull, abandon, free and retire
----------------------------------------------------------- */
// Move a page from the full list back to a regular list
// Move a page from the full list back to a regular list (called from thread-local mi_free)
void _mi_page_unfull(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(mi_page_is_in_full(page));
mi_assert_internal(!mi_page_heap(page)->allow_page_abandon);
if (!mi_page_is_in_full(page)) return;
mi_heap_t* heap = mi_page_heap(page);
@ -365,85 +328,40 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(!mi_page_immediate_available(page));
mi_assert_internal(!mi_page_is_in_full(page));
if (mi_page_is_in_full(page)) return;
mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
_mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
}
// Abandon a page with used blocks at the end of a thread.
// Note: only call if it is ensured that no references exist from
// the `page->heap->thread_delayed_free` into this page.
// Currently only called through `mi_heap_collect_ex` which ensures this.
void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(mi_page_heap(page) != NULL);
mi_heap_t* pheap = mi_page_heap(page);
// remove from our page list
mi_segments_tld_t* segments_tld = &pheap->tld->segments;
mi_page_queue_remove(pq, page);
// page is no longer associated with our heap
mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
mi_page_set_heap(page, NULL);
#if (MI_DEBUG>1) && !MI_TRACK_ENABLED
// check there are no references left..
for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) {
mi_assert_internal(_mi_ptr_page(block) != page);
}
#endif
// and abandon it
mi_assert_internal(mi_page_heap(page) == NULL);
_mi_segment_page_abandon(page,segments_tld);
}
// force abandon a page
void _mi_page_force_abandon(mi_page_t* page) {
mi_heap_t* heap = mi_page_heap(page);
// mark page as not using delayed free
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
// ensure this page is no longer in the heap delayed free list
_mi_heap_delayed_free_all(heap);
// We can still access the page meta-info even if it is freed as we ensure
// in `mi_segment_force_abandon` that the segment is not freed (yet)
if (page->capacity == 0) return; // it may have been freed now
// and now unlink it from the page queue and abandon (or free)
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
if (mi_page_all_free(page)) {
_mi_page_free(page, pq, false);
}
else {
if (heap->allow_page_abandon) {
// abandon full pages
_mi_page_abandon(page, pq);
}
else {
// put full pages in a heap local queue
if (mi_page_is_in_full(page)) return;
mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
_mi_page_free_collect(page, false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
}
}
// Free a page with no more free blocks
void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(mi_page_all_free(page));
mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
// mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
// no more aligned blocks in here
mi_page_set_has_aligned(page, false);
// remove from the page list
// (no need to do _mi_heap_delayed_free first as all blocks are already free)
mi_segments_tld_t* segments_tld = &mi_page_heap(page)->tld->segments;
mi_page_queue_remove(pq, page);
// and free it
mi_heap_t* heap = page->heap;
mi_page_set_heap(page,NULL);
_mi_segment_page_free(page, force, segments_tld);
_mi_arenas_page_free(page);
_mi_arenas_collect(false, false, heap->tld); // allow purging
}
#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE
@ -473,9 +391,9 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
const size_t bsize = mi_page_block_size(page);
if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue?
if (pq->last==page && pq->first==page) { // the only page in the queue?
mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
mi_heap_t* heap = mi_page_heap(page);
mi_debug_heap_stat_counter_increase(heap, page_no_retire, 1);
page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
mi_assert_internal(pq >= heap->pages);
const size_t index = pq - heap->pages;
mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
@ -486,7 +404,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
}
}
#endif
_mi_page_free(page, pq, false);
_mi_page_free(page, pq);
}
// free retired pages: we don't need to look at the entire queues
@ -501,7 +419,7 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
if (mi_page_all_free(page)) {
page->retire_expire--;
if (force || page->retire_expire == 0) {
_mi_page_free(pq->first, pq, force);
_mi_page_free(pq->first, pq);
}
else {
// keep retired, update min/max
@ -519,6 +437,36 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
}
static void mi_heap_collect_full_pages(mi_heap_t* heap) {
// note: normally full pages get immediately abandoned and the full queue is always empty
// this path is only used if abandoning is disabled due to a destroy-able heap or options
// set by the user.
mi_page_queue_t* pq = &heap->pages[MI_BIN_FULL];
for (mi_page_t* page = pq->first; page != NULL; ) {
mi_page_t* next = page->next; // get next in case we free the page
_mi_page_free_collect(page, false); // register concurrent free's
// no longer full?
if (!mi_page_is_full(page)) {
if (mi_page_all_free(page)) {
_mi_page_free(page, pq);
}
else {
_mi_page_unfull(page);
}
}
page = next;
}
}
static mi_decl_noinline void mi_heap_generic_collect(mi_heap_t* heap) {
// call potential deferred free routines
_mi_deferred_free(heap, false);
// collect retired pages
_mi_heap_collect_retired(heap, false);
// collect full pages that had concurrent free's
mi_heap_collect_full_pages(heap);
}
/* -----------------------------------------------------------
Initialize the initial free list in a page.
In secure mode we initialize a randomized list by
@ -531,7 +479,7 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
MI_UNUSED(stats);
#if (MI_SECURE<=2)
#if (MI_SECURE<3)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
#endif
@ -589,7 +537,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
{
MI_UNUSED(stats);
#if (MI_SECURE <= 2)
#if (MI_SECURE<3)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
#endif
@ -617,7 +565,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
----------------------------------------------------------- */
#define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well.
#if (MI_SECURE>0)
#if (MI_SECURE>=3)
#define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many
#else
#define MI_MIN_EXTEND (1)
@ -628,9 +576,9 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
// Note: we also experimented with "bump" allocation on the first
// allocations but this did not speed up any benchmark (due to an
// extra test in malloc? or cache effects?)
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) {
mi_assert_expensive(mi_page_is_valid_init(page));
#if (MI_SECURE<=2)
#if (MI_SECURE<3)
mi_assert(page->free == NULL);
mi_assert(page->local_free == NULL);
if (page->free != NULL) return;
@ -639,12 +587,12 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
size_t page_size;
//uint8_t* page_start =
_mi_segment_page_start(_mi_page_segment(page), page, &page_size);
mi_stat_counter_increase(tld->stats.pages_extended, 1);
mi_page_area(page, &page_size);
mi_debug_heap_stat_counter_increase(heap, pages_extended, 1);
// calculate the extend count
const size_t bsize = mi_page_block_size(page);
size_t extend = page->reserved - page->capacity;
size_t extend = (size_t)page->reserved - page->capacity;
mi_assert_internal(extend > 0);
size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize);
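// Illustrative worked example (not part of the diff), assuming the clamp
// `if (extend > max_extend) extend = max_extend;` that follows in the source:
// with MI_MAX_EXTEND_SIZE = 4 KiB and bsize = 64 bytes, max_extend = 4096/64 = 64,
// so a fresh page with reserved = 1024 and capacity = 0 grows its free list by
// at most 64 blocks per slow-path call instead of all 1024 at once, bounding
// the work done (and memory touched) on a single allocation.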
@ -660,56 +608,56 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
mi_assert_internal(extend < (1UL<<16));
// commit on demand?
if (page->slice_committed > 0) {
const size_t needed_size = (page->capacity + extend)*bsize;
const size_t needed_commit = _mi_align_up( mi_page_slice_offset_of(page, needed_size), MI_PAGE_MIN_COMMIT_SIZE );
if (needed_commit > page->slice_committed) {
mi_assert_internal(((needed_commit - page->slice_committed) % _mi_os_page_size()) == 0);
_mi_os_commit(mi_page_slice_start(page) + page->slice_committed, needed_commit - page->slice_committed, NULL);
page->slice_committed = needed_commit;
}
}
// and append the extended part to the free list
if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) {
mi_page_free_list_extend(page, bsize, extend, &tld->stats );
if (extend < MI_MIN_SLICES || MI_SECURE<3) { //!mi_option_is_enabled(mi_option_secure)) {
mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats );
}
else {
mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats);
mi_page_free_list_extend_secure(heap, page, bsize, extend, &heap->tld->stats);
}
// enable the new free list
page->capacity += (uint16_t)extend;
mi_stat_increase(tld->stats.page_committed, extend * bsize);
mi_debug_heap_stat_increase(heap, page_committed, extend * bsize);
mi_assert_expensive(mi_page_is_valid_init(page));
}
// Initialize a fresh page
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) {
// Initialize a fresh page (that is already partially initialized)
void _mi_page_init(mi_heap_t* heap, mi_page_t* page) {
mi_assert(page != NULL);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert(segment != NULL);
mi_assert_internal(block_size > 0);
// set fields
mi_page_set_heap(page, heap);
page->block_size = block_size;
size_t page_size;
page->page_start = _mi_segment_page_start(segment, page, &page_size);
mi_track_mem_noaccess(page->page_start,page_size);
mi_assert_internal(page_size / block_size < (1L<<16));
page->reserved = (uint16_t)(page_size / block_size);
uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start);
mi_track_mem_noaccess(page_start,page_size);
mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16));
mi_assert_internal(page->reserved > 0);
#if (MI_PADDING || MI_ENCODE_FREELIST)
page->keys[0] = _mi_heap_random_next(heap);
page->keys[1] = _mi_heap_random_next(heap);
#endif
page->free_is_zero = page->is_zero_init;
#if MI_DEBUG>2
if (page->is_zero_init) {
mi_track_mem_defined(page->page_start, page_size);
mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size));
if (page->memid.initially_zero) {
mi_track_mem_defined(page->page_start, mi_page_committed(page));
mi_assert_expensive(mi_mem_is_zero(page_start, mi_page_committed(page)));
}
#endif
if (block_size > 0 && _mi_is_power_of_two(block_size)) {
page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size));
}
else {
page->block_size_shift = 0;
}
mi_assert_internal(page->capacity == 0);
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->used == 0);
mi_assert_internal(page->xthread_free == 0);
mi_assert_internal(mi_page_is_owned(page));
mi_assert_internal(page->xthread_free == 1);
mi_assert_internal(page->next == NULL);
mi_assert_internal(page->prev == NULL);
mi_assert_internal(page->retire_expire == 0);
@ -718,11 +666,11 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_internal(page->keys[0] != 0);
mi_assert_internal(page->keys[1] != 0);
#endif
mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift)));
mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift)));
mi_assert_expensive(mi_page_is_valid_init(page));
// initialize an initial free list
mi_page_extend_free(heap,page,tld);
mi_page_extend_free(heap,page);
mi_assert(mi_page_immediate_available(page));
}
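// Call-sequence sketch (editor illustration, grounded only in the asserts and
// field reads above): by the time `_mi_page_init` runs, the caller -- presumably
// the arena page allocation path -- has already set the block size, `reserved`,
// the memid, and taken ownership of the page; this function then attaches the
// heap, seeds the free-list keys, sets `free_is_zero`, and builds the initial
// free list via `mi_page_extend_free`, e.g. roughly:
//
//   /* caller: allocate page meta-data, set block size, reserved, memid, own it */
//   _mi_page_init(heap, page);                       // attach heap + initial free list
//   mi_assert(mi_page_immediate_available(page));    // as asserted above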
@ -731,40 +679,29 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
Find pages with free blocks
-------------------------------------------------------------*/
// search for a best next page to use for at most N pages (often cut short if immediate blocks are available)
#define MI_MAX_CANDIDATE_SEARCH (4)
// is the page not yet used up to its reserved space?
static bool mi_page_is_expandable(const mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(page->capacity <= page->reserved);
return (page->capacity < page->reserved);
}
// Find a page with free blocks of `page->block_size`.
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
{
// search through the pages in "next fit" order
#if MI_STAT
size_t count = 0;
#endif
size_t candidate_count = 0; // we reset this on the first candidate to limit the search
long candidate_limit = 0; // we reset this on the first candidate to limit the search
long full_page_retain = heap->full_page_retain;
mi_page_t* page_candidate = NULL; // a page with free space
mi_page_t* page = pq->first;
while (page != NULL)
{
mi_page_t* next = page->next; // remember next
mi_page_t* next = page->next; // remember next (as this page can move to another queue)
#if MI_STAT
count++;
#endif
candidate_count++;
candidate_limit--;
// collect freed blocks by us and other threads
_mi_page_free_collect(page, false);
#if MI_MAX_CANDIDATE_SEARCH > 1
// search up to N pages for a best candidate
// is the local free list non-empty?
@ -773,28 +710,36 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
// if the page is completely full, move it to the `mi_pages_full`
// queue so we don't visit long-lived pages too often.
if (!immediate_available && !mi_page_is_expandable(page)) {
mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
mi_page_to_full(page, pq);
full_page_retain--;
if (full_page_retain < 0) {
mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
mi_page_to_full(page, pq);
}
}
else {
// the page has free space, make it a candidate
// we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages)
if (page_candidate == NULL) {
page_candidate = page;
candidate_count = 0;
candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates);
}
else if (mi_page_all_free(page_candidate)) {
_mi_page_free(page_candidate, pq);
page_candidate = page;
}
// prefer to reuse fuller pages (in the hope the less used page gets freed)
else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page) && !mi_page_is_expandable(page)) {
else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { // && !mi_page_is_expandable(page)) {
page_candidate = page;
}
// if we find a non-expandable candidate, or searched for N pages, return with the best candidate
if (immediate_available || candidate_count > MI_MAX_CANDIDATE_SEARCH) {
if (immediate_available || candidate_limit <= 0) {
mi_assert_internal(page_candidate!=NULL);
break;
}
}
#else
// first-fit algorithm
#if 0
// first-fit algorithm without candidates
// If the page contains free blocks, we are done
if (mi_page_immediate_available(page) || mi_page_is_expandable(page)) {
break; // pick this one
@ -809,26 +754,32 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
page = next;
} // for each page
mi_heap_stat_counter_increase(heap, searches, count);
mi_debug_heap_stat_counter_increase(heap, searches, count);
// set the page to the best candidate
if (page_candidate != NULL) {
page = page_candidate;
}
if (page != NULL && !mi_page_immediate_available(page)) {
mi_assert_internal(mi_page_is_expandable(page));
mi_page_extend_free(heap, page, heap->tld);
if (page != NULL) {
if (!mi_page_immediate_available(page)) {
mi_assert_internal(mi_page_is_expandable(page));
mi_page_extend_free(heap, page);
}
mi_assert_internal(mi_page_immediate_available(page));
}
if (page == NULL) {
_mi_heap_collect_retired(heap, false); // perhaps make a page available
page = mi_page_fresh(heap, pq);
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
if (page == NULL && first_try) {
// out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
page = mi_page_queue_find_free_ex(heap, pq, false);
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
}
}
else {
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
// move the page to the front of the queue
mi_page_queue_move_to_front(heap, pq, page);
page->retire_expire = 0;
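// Illustrative walk-through (not part of the diff) of the candidate policy
// above, assuming mi_option_page_max_candidates = 4 and a queue holding pages
// A (used 10/32), B (used 20/32), C (completely free):
//   - A becomes the first candidate and resets candidate_limit to 4;
//   - B replaces A since it is at least as used and not "mostly used",
//     concentrating allocations so the emptier page A has a chance to drain;
//   - had the current candidate turned out to be completely free (like C) when
//     a better page shows up, it is freed on the spot via _mi_page_free;
//   - the scan stops early once a page has blocks immediately available or
//     candidate_limit reaches zero.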
@ -843,15 +794,16 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
// Find a page with free blocks of `size`.
static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
mi_page_queue_t* pq = mi_page_queue(heap, size);
static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) {
// mi_page_queue_t* pq = mi_page_queue(heap, size);
mi_assert_internal(!mi_page_queue_is_huge(pq));
// check the first page: we even do this with candidate search or otherwise we re-search every time
mi_page_t* page = pq->first;
if (page != NULL) {
#if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
mi_page_extend_free(heap, page, heap->tld);
mi_page_extend_free(heap, page);
mi_assert_internal(mi_page_immediate_available(page));
}
else
@ -902,13 +854,13 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex
// Huge pages contain just one block, and the segment contains just that page.
// Huge pages are also used if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX)
// so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`.
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) {
size_t block_size = _mi_os_good_alloc_size(size);
mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0);
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment, mi_page_queue_t* pq) {
const size_t block_size = _mi_os_good_alloc_size(size);
// mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0);
#if MI_HUGE_PAGE_ABANDON
mi_page_queue_t* pq = NULL;
#error todo.
#else
mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size
// mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size
mi_assert_internal(mi_page_queue_is_huge(pq));
#endif
mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment);
@ -916,10 +868,9 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a
mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(mi_page_is_huge(page));
mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE);
mi_assert_internal(_mi_page_segment(page)->used==1);
mi_assert_internal(mi_page_is_singleton(page));
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
mi_assert_internal(mi_page_is_abandoned(page));
mi_page_set_heap(page, NULL);
#endif
mi_heap_stat_increase(heap, huge, mi_page_block_size(page));
@ -932,30 +883,30 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a
// Allocate a page
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept {
// huge allocation?
const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) {
if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) {
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
return NULL;
}
else {
return mi_huge_page_alloc(heap,size,huge_alignment);
}
if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) {
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
return NULL;
}
mi_page_queue_t* pq = mi_page_queue(heap, (huge_alignment > 0 ? MI_LARGE_MAX_OBJ_SIZE+1 : size));
// huge allocation?
if mi_unlikely(mi_page_queue_is_huge(pq) || req_size > MI_MAX_ALLOC_SIZE) {
return mi_huge_page_alloc(heap,size,huge_alignment,pq);
}
else {
// otherwise find a page with free blocks in our size segregated queues
#if MI_PADDING
mi_assert_internal(size >= MI_PADDING_SIZE);
#endif
return mi_find_free_page(heap, size);
return mi_find_free_page(heap, pq);
}
}
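// Worked example (illustrative, not part of the diff) of the new dispatch: the
// page queue is selected first, from the size or -- when huge_alignment > 0 --
// from MI_LARGE_MAX_OBJ_SIZE+1, and the huge path is taken whenever that queue
// is the huge queue. A small request (say 64 bytes) therefore goes through
// mi_find_free_page on its size-segregated queue, while an over-large request
// or any request with a huge alignment lands in the huge queue and is served
// by mi_huge_page_alloc as a singleton page.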
// Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for
// very large requested alignments in which case we use a huge segment.
// The `huge_alignment` is normally 0 but is set to a multiple of MI_SLICE_SIZE for
// very large requested alignments in which case we use a huge singleton page.
void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept
{
mi_assert_internal(heap != NULL);
@ -967,15 +918,16 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
}
mi_assert_internal(mi_heap_is_initialized(heap));
// call potential deferred free routines
_mi_deferred_free(heap, false);
// free delayed frees from other threads (but skip contended ones)
_mi_heap_delayed_free_partial(heap);
// collect every N generic mallocs
if mi_unlikely(heap->generic_count++ > 10000) {
heap->generic_count = 0;
mi_heap_generic_collect(heap);
}
// find (or allocate) a page of the right size
mi_page_t* page = mi_find_page(heap, size, huge_alignment);
if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more
mi_heap_generic_collect(heap);
mi_heap_collect(heap, true /* force */);
page = mi_find_page(heap, size, huge_alignment);
}
@ -988,6 +940,8 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_ptr_page(page)==page);
// and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc)
if mi_unlikely(zero && mi_page_is_huge(page)) {


@ -239,6 +239,9 @@ void _mi_prim_thread_done_auto_done(void) {
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
MI_UNUSED(heap);
}
#endif
bool _mi_prim_thread_is_in_threadpool(void) {
return false;
}


@ -62,6 +62,7 @@ terms of the MIT license. A copy of the license can be found in the file
#include <sys/syscall.h>
#endif
#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this?
//------------------------------------------------------------------------------------
// Use syscalls for some primitives to allow for libraries that override open/read/close etc.
@ -147,7 +148,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
}
#endif
}
config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE;
config->has_overcommit = unix_detect_overcommit();
config->has_partial_free = true; // mmap can free in parts
config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE)
@ -362,6 +363,9 @@ int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool comm
mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
mi_assert_internal(commit || !allow_large);
mi_assert_internal(try_alignment > 0);
if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) {
try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations
}
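// Illustrative numbers (not part of the diff): with MI_UNIX_LARGE_PAGE_SIZE of
// 2 MiB the hint applies to mappings of at least 8*2 MiB = 16 MiB whose
// requested alignment is a power of two below 2 MiB; bumping the alignment to
// 2 MiB presumably makes it more likely the kernel can back the mapping with
// transparent huge pages.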
*is_zero = true;
int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
@ -409,7 +413,7 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
int err = 0;
// decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
err = unix_madvise(start, size, MADV_DONTNEED);
#if !MI_DEBUG && !MI_SECURE
#if !MI_DEBUG && MI_SECURE<=2
*needs_recommit = false;
#else
*needs_recommit = true;
@ -479,7 +483,7 @@ static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, co
int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
bool is_large = true;
*is_zero = true;
*addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
*addr = unix_mmap(hint_addr, size, MI_ARENA_SLICE_ALIGN, PROT_READ | PROT_WRITE, true, true, &is_large);
if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
unsigned long numa_mask = (1UL << numa_node);
// TODO: does `mbind` work correctly for huge OS pages? should we
@ -886,3 +890,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
}
#endif
bool _mi_prim_thread_is_in_threadpool(void) {
return false;
}


@ -277,3 +277,7 @@ void _mi_prim_thread_done_auto_done(void) {
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
MI_UNUSED(heap);
}
bool _mi_prim_thread_is_in_threadpool(void) {
return false;
}


@ -17,6 +17,14 @@ terms of the MIT license. A copy of the license can be found in the file
// Dynamically bind Windows API points for portability
//---------------------------------------------
#if defined(_MSC_VER)
#pragma warning(disable:28159) // don't use GetVersion
#pragma warning(disable:4996) // don't use GetVersion
#endif
static DWORD win_major_version = 6;
static DWORD win_minor_version = 0;
// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
// NtAllocateVirtualMemoryEx is used for huge OS page allocation (1GiB)
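// A minimal sketch (editor illustration, not from the diff) of the dynamic
// binding the comment above describes: resolve VirtualAlloc2 at runtime so the
// library still loads on Windows versions that predate it. The `_sketch` names
// and the use of a plain void* for the extended parameters are assumptions kept
// deliberately loose; the real code declares the fully typed signature.
#if 0   // illustration only
#include <windows.h>
typedef void* (__stdcall *PVirtualAlloc2_sketch)(
  HANDLE process, void* base, SIZE_T size,
  ULONG alloc_type, ULONG page_protection,
  void* extended_params, ULONG param_count);

static PVirtualAlloc2_sketch pVirtualAlloc2_sketch = NULL;

static void bind_virtual_alloc2_sketch(void) {
  // VirtualAlloc2 is exported from kernelbase.dll on Windows 10 / Server 2016 and later
  HMODULE h = GetModuleHandleA("kernelbase.dll");
  if (h != NULL) {
    pVirtualAlloc2_sketch = (PVirtualAlloc2_sketch)GetProcAddress(h, "VirtualAlloc2");
  }
  // if it stays NULL, callers fall back to plain VirtualAlloc without the
  // alignment extended parameter.
}
#endif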
@ -108,16 +116,25 @@ static bool win_enable_large_os_pages(size_t* large_page_size)
// Initialize
//---------------------------------------------
static DWORD win_allocation_granularity = 64*MI_KiB;
void _mi_prim_mem_init( mi_os_mem_config_t* config )
{
config->has_overcommit = false;
config->has_partial_free = false;
config->has_virtual_reserve = true;
// windows version
const DWORD win_version = GetVersion();
win_major_version = (DWORD)(LOBYTE(LOWORD(win_version)));
win_minor_version = (DWORD)(HIBYTE(LOWORD(win_version)));
// get the page size
SYSTEM_INFO si;
GetSystemInfo(&si);
if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; }
if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; }
if (si.dwAllocationGranularity > 0) {
config->alloc_granularity = si.dwAllocationGranularity;
win_allocation_granularity = si.dwAllocationGranularity;
}
// get virtual address bits
if ((uintptr_t)si.lpMaximumApplicationAddress > 0) {
const size_t vbits = MI_INTPTR_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress);
@ -127,7 +144,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
ULONGLONG memInKiB = 0;
if (GetPhysicallyInstalledSystemMemory(&memInKiB)) {
if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) {
config->physical_memory = memInKiB * MI_KiB;
config->physical_memory = (size_t)(memInKiB * MI_KiB);
}
}
// get the VirtualAlloc2 function
@ -175,7 +192,7 @@ int _mi_prim_free(void* addr, size_t size ) {
// the start of the region.
MEMORY_BASIC_INFORMATION info = { 0 };
VirtualQuery(addr, &info, sizeof(info));
if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) {
if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)(4*MI_MiB)) {
errcode = 0;
err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
if (err) { errcode = GetLastError(); }
@ -203,7 +220,7 @@ static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_ali
}
#endif
// on modern Windows try use VirtualAlloc2 for aligned allocation
if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
if (addr == NULL && try_alignment > win_allocation_granularity && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
reqs.Alignment = try_alignment;
MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
@ -239,7 +256,7 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen
// success, return the address
return p;
}
else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) &&
else if (max_retry_msecs > 0 && (try_alignment <= 8*MI_MiB) &&
(flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 &&
win_is_out_of_memory_error(GetLastError())) {
// if committing regular memory and being out-of-memory,
@ -815,3 +832,16 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
mi_allocator_done();
}
#endif
bool _mi_prim_thread_is_in_threadpool(void) {
#if (MI_ARCH_X64 || MI_ARCH_X86)
if (win_major_version >= 6) {
// check if this thread belongs to a windows threadpool
// see: <https://www.geoffchappell.com/studies/windows/km/ntoskrnl/inc/api/pebteb/teb/index.htm>
_TEB* const teb = NtCurrentTeb();
void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778)));
return (pool_data != NULL);
}
#endif
return false;
}
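// Hedged usage note (not from the diff): how mimalloc consumes this result is
// not shown here, but a caller would presumably branch on it to treat
// thread-pool threads -- which are long-lived and recycled -- more
// conservatively, e.g.:
//   if (_mi_prim_thread_is_in_threadpool()) { /* adjust per-thread heap
//     teardown or reclaim behavior for pool threads */ }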


@ -7,7 +7,6 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/prim.h" // _mi_prim_random_buf
#include <string.h> // memset
/* ----------------------------------------------------------------------------
We use our own PRNG to keep predictable performance of random number generation
@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil
(gcc x64 has no register spills, and clang 6+ uses SSE instructions)
-----------------------------------------------------------------------------*/
static inline uint32_t rotl(uint32_t x, uint32_t shift) {
return (x << shift) | (x >> (32 - shift));
}
static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16);
x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12);
x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8);
x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7);
}
static void chacha_block(mi_random_ctx_t* ctx)
@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
// since we only use chacha for randomness (and not encryption) we
// do not _need_ to read 32-bit values as little endian but we do anyways
// just for being compatible :-)
memset(ctx, 0, sizeof(*ctx));
_mi_memzero(ctx, sizeof(*ctx));
for (size_t i = 0; i < 4; i++) {
const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
ctx->input[i] = read32(sigma,i);
@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
}
static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
memset(ctx_new, 0, sizeof(*ctx_new));
_mi_memzero(ctx_new, sizeof(*ctx_new));
_mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
ctx_new->input[12] = 0;
ctx_new->input[13] = 0;
@ -160,7 +155,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim
uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
x ^= _mi_prim_clock_now();
x ^= _mi_prim_clock_now();
// and do a few randomization steps
uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
for (uintptr_t i = 0; i < max; i++) {


@ -1,136 +0,0 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* -----------------------------------------------------------
The following functions are to reliably find the segment or
block that encompasses any pointer p (or NULL if it is not
in any of our segments).
We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB)
set to 1 if it contains the segment meta data.
----------------------------------------------------------- */
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"
// Reduce total address space to reduce .bss (due to the `mi_segment_map`)
#if (MI_INTPTR_SIZE > 4) && MI_TRACK_ASAN
#define MI_SEGMENT_MAP_MAX_ADDRESS (128*1024ULL*MI_GiB) // 128 TiB (see issue #881)
#elif (MI_INTPTR_SIZE > 4)
#define MI_SEGMENT_MAP_MAX_ADDRESS (48*1024ULL*MI_GiB) // 48 TiB
#else
#define MI_SEGMENT_MAP_MAX_ADDRESS (UINT32_MAX)
#endif
#define MI_SEGMENT_MAP_PART_SIZE (MI_INTPTR_SIZE*MI_KiB - 128) // 128 > sizeof(mi_memid_t) !
#define MI_SEGMENT_MAP_PART_BITS (8*MI_SEGMENT_MAP_PART_SIZE)
#define MI_SEGMENT_MAP_PART_ENTRIES (MI_SEGMENT_MAP_PART_SIZE / MI_INTPTR_SIZE)
#define MI_SEGMENT_MAP_PART_BIT_SPAN (MI_SEGMENT_ALIGN)
#define MI_SEGMENT_MAP_PART_SPAN (MI_SEGMENT_MAP_PART_BITS * MI_SEGMENT_MAP_PART_BIT_SPAN)
#define MI_SEGMENT_MAP_MAX_PARTS ((MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_SPAN) + 1)
// A part of the segment map.
typedef struct mi_segmap_part_s {
mi_memid_t memid;
_Atomic(uintptr_t) map[MI_SEGMENT_MAP_PART_ENTRIES];
} mi_segmap_part_t;
// Allocate parts on-demand to reduce .bss footprint
static _Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. }
static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) {
// note: segment can be invalid or NULL.
mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE?
*idx = 0;
*bitidx = 0;
if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL;
const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN;
if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL;
mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[segindex]);
// allocate on demand to reduce .bss footprint
if (part == NULL) {
if (!create_on_demand) return NULL;
mi_memid_t memid;
part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid);
if (part == NULL) return NULL;
part->memid = memid;
mi_segmap_part_t* expected = NULL;
if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) {
_mi_os_free(part, sizeof(mi_segmap_part_t), memid);
part = expected;
if (part == NULL) return NULL;
}
}
mi_assert(part != NULL);
const uintptr_t offset = ((uintptr_t)segment) % MI_SEGMENT_MAP_PART_SPAN;
const uintptr_t bitofs = offset / MI_SEGMENT_MAP_PART_BIT_SPAN;
*idx = bitofs / MI_INTPTR_BITS;
*bitidx = bitofs % MI_INTPTR_BITS;
return part;
}
void _mi_segment_map_allocated_at(const mi_segment_t* segment) {
if (segment->memid.memkind == MI_MEM_ARENA) return; // we lookup segments first in the arena's and don't need the segment map
size_t index;
size_t bitidx;
mi_segmap_part_t* part = mi_segment_map_index_of(segment, true /* alloc map if needed */, &index, &bitidx);
if (part == NULL) return; // outside our address range..
uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]);
uintptr_t newmask;
do {
newmask = (mask | ((uintptr_t)1 << bitidx));
} while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask));
}
void _mi_segment_map_freed_at(const mi_segment_t* segment) {
if (segment->memid.memkind == MI_MEM_ARENA) return;
size_t index;
size_t bitidx;
mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* don't alloc if not present */, &index, &bitidx);
if (part == NULL) return; // outside our address range..
uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]);
uintptr_t newmask;
do {
newmask = (mask & ~((uintptr_t)1 << bitidx));
} while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask));
}
// Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
static mi_segment_t* _mi_segment_of(const void* p) {
if (p == NULL) return NULL;
mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL
size_t index;
size_t bitidx;
mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx);
if (part == NULL) return NULL;
const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]);
if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) {
bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok);
return segment; // yes, allocated by us
}
return NULL;
}
// Is this a valid pointer in our heap?
static bool mi_is_valid_pointer(const void* p) {
// first check if it is in an arena, then check if it is OS allocated
return (_mi_arena_contains(p) || _mi_segment_of(p) != NULL);
}
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return mi_is_valid_pointer(p);
}
void _mi_segment_map_unsafe_destroy(void) {
for (size_t i = 0; i < MI_SEGMENT_MAP_MAX_PARTS; i++) {
mi_segmap_part_t* part = mi_atomic_exchange_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[i], NULL);
if (part != NULL) {
_mi_os_free(part, sizeof(mi_segmap_part_t), part->memid);
}
}
}

File diff suppressed because it is too large


@ -20,10 +20,11 @@ terms of the MIT license. A copy of the license can be found in the file
// containing the whole library. If it is linked first
// it will override all the standard library allocation
// functions (on Unix's).
#include "alloc.c" // includes alloc-override.c
#include "alloc.c" // includes alloc-override.c and free.c
#include "alloc-aligned.c"
#include "alloc-posix.c"
#include "arena.c"
#include "arena-meta.c"
#include "bitmap.c"
#include "heap.c"
#include "init.c"
@ -31,9 +32,8 @@ terms of the MIT license. A copy of the license can be found in the file
#include "options.c"
#include "os.c"
#include "page.c" // includes page-queue.c
#include "page-map.c"
#include "random.c"
#include "segment.c"
#include "segment-map.c"
#include "stats.c"
#include "prim/prim.c"
#if MI_OSX_ZONE


@ -19,85 +19,92 @@ terms of the MIT license. A copy of the license can be found in the file
Statistics operations
----------------------------------------------------------- */
static bool mi_is_in_main(void* stat) {
return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main
&& (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t)));
static void mi_stat_update_mt(mi_stat_count_t* stat, int64_t amount) {
if (amount == 0) return;
// add atomically
int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
if (amount > 0) {
mi_atomic_addi64_relaxed(&stat->allocated, amount);
}
else {
mi_atomic_addi64_relaxed(&stat->freed, -amount);
}
}
static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
if (amount == 0) return;
if mi_unlikely(mi_is_in_main(stat))
{
// add atomically (for abandoned pages)
int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
if (amount > 0) {
mi_atomic_addi64_relaxed(&stat->allocated,amount);
}
else {
mi_atomic_addi64_relaxed(&stat->freed, -amount);
}
// add thread local
stat->current += amount;
if (stat->current > stat->peak) stat->peak = stat->current;
if (amount > 0) {
stat->allocated += amount;
}
else {
// add thread local
stat->current += amount;
if (stat->current > stat->peak) stat->peak = stat->current;
if (amount > 0) {
stat->allocated += amount;
}
else {
stat->freed += -amount;
}
stat->freed += -amount;
}
}
// Adjust stats to compensate; for example before committing a range,
// first adjust downwards with parts that were already committed so
// first adjust downwards with parts that were already committed so
// we avoid double counting.
static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) {
static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount, bool on_alloc) {
if (amount == 0) return;
if mi_unlikely(mi_is_in_main(stat))
{
// adjust atomically
mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_addi64_relaxed(&stat->allocated, amount);
mi_atomic_addi64_relaxed(&stat->freed, amount);
}
else {
// don't affect the peak
stat->current += amount;
// add to both
// adjust atomically
mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount);
}
static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount, bool on_alloc) {
if (amount == 0) return;
stat->current += amount;
if (on_alloc) {
stat->allocated += amount;
stat->freed += amount;
}
}
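// Worked example (illustrative, not part of the diff) of the compensation
// described above: when committing a 4 MiB range of which 1 MiB was already
// committed, the caller first adjusts the committed statistic down by the
// 1 MiB overlap and then records the full 4 MiB commit; `current` ends up
// +3 MiB, and `peak` reflects only the real increase since the adjust path
// above deliberately skips the peak update.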
void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
if (mi_is_in_main(stat)) {
mi_atomic_addi64_relaxed( &stat->count, 1 );
mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
}
else {
stat->count++;
stat->total += amount;
stat->freed += amount;
}
}
void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) {
mi_atomic_addi64_relaxed(&stat->count, 1);
mi_atomic_addi64_relaxed(&stat->total, (int64_t)amount);
}
void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
stat->count++;
stat->total += amount;
}
void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) {
mi_stat_update_mt(stat, (int64_t)amount);
}
void __mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
mi_stat_update(stat, (int64_t)amount);
}
void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount) {
mi_stat_update_mt(stat, -((int64_t)amount));
}
void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
mi_stat_update(stat, -((int64_t)amount));
}
void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount) {
mi_stat_adjust(stat, (int64_t)amount);
void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
mi_stat_adjust_mt(stat, (int64_t)amount, on_alloc);
}
void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
mi_stat_adjust(stat, (int64_t)amount, on_alloc);
}
void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) {
mi_stat_adjust(stat, -((int64_t)amount));
void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
mi_stat_adjust_mt(stat, -((int64_t)amount), on_alloc);
}
void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
mi_stat_adjust(stat, -((int64_t)amount), on_alloc);
}
// must be thread safe as it is called from stats_merge
static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
@ -119,7 +126,6 @@ static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t
// must be thread safe as it is called from stats_merge
static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
if (stats==src) return;
mi_stat_add(&stats->segments, &src->segments,1);
mi_stat_add(&stats->pages, &src->pages,1);
mi_stat_add(&stats->reserved, &src->reserved, 1);
mi_stat_add(&stats->committed, &src->committed, 1);
@ -128,11 +134,9 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
mi_stat_add(&stats->page_committed, &src->page_committed, 1);
mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1);
mi_stat_add(&stats->threads, &src->threads, 1);
mi_stat_add(&stats->malloc, &src->malloc, 1);
mi_stat_add(&stats->segments_cache, &src->segments_cache, 1);
mi_stat_add(&stats->normal, &src->normal, 1);
mi_stat_add(&stats->huge, &src->huge, 1);
mi_stat_add(&stats->giant, &src->giant, 1);
@ -146,7 +150,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
mi_stat_counter_add(&stats->searches, &src->searches, 1);
mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1);
mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1);
#if MI_STAT>1
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
@ -165,7 +169,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
// unit == 0: count as decimal
// unit < 0 : count in binary
static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) {
char buf[32]; buf[0] = 0;
char buf[32]; _mi_memzero_var(buf);
int len = 32;
const char* suffix = (unit <= 0 ? " " : "B");
const int64_t base = (unit == 0 ? 1000 : 1024);
@ -330,7 +334,7 @@ static void mi_cdecl mi_buffered_out(const char* msg, void* arg) {
static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
// wrap the output function to be line buffered
char buf[256];
char buf[256]; _mi_memzero_var(buf);
buffered_t buffer = { out0, arg0, NULL, 0, 255 };
buffer.buf = buf;
mi_output_fun* out = &mi_buffered_out;
@ -343,7 +347,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
#endif
#if MI_STAT
mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg);
mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
mi_stat_count_t total = { 0,0,0,0 };
mi_stat_add(&total, &stats->normal, 1);
mi_stat_add(&total, &stats->huge, 1);
@ -357,21 +361,24 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, "");
mi_stat_peak_print(&stats->reset, "reset", 1, out, arg );
mi_stat_peak_print(&stats->purged, "purged", 1, out, arg );
mi_stat_print(&stats->page_committed, "touched", 1, out, arg);
mi_stat_print(&stats->segments, "segments", -1, out, arg);
mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
mi_stat_print(&stats->pages, "pages", -1, out, arg);
//mi_stat_print(&stats->segments, "segments", -1, out, arg);
//mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
//mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, "");
mi_stat_print_ex(&stats->pages, "pages", -1, out, arg, "");
mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg);
mi_stat_counter_print(&stats->pages_reclaim_on_alloc, "-reclaima", out, arg);
mi_stat_counter_print(&stats->pages_reclaim_on_free, "-reclaimf", out, arg);
mi_stat_counter_print(&stats->pages_reabandon_full, "-reabandon", out, arg);
mi_stat_counter_print(&stats->pages_unabandon_busy_wait, "-waits", out, arg);
mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg);
mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg);
mi_stat_counter_print(&stats->arena_count, "arenas", out, arg);
mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg);
mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg);
mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg);
mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
mi_stat_counter_print(&stats->reset_calls, "resets", out, arg);
mi_stat_counter_print(&stats->purge_calls, "purges", out, arg);
mi_stat_counter_print(&stats->arena_purges, "-purges", out, arg);
mi_stat_counter_print(&stats->mmap_calls, "mmap calls", out, arg);
mi_stat_counter_print(&stats->commit_calls, " -commit", out, arg);
mi_stat_counter_print(&stats->reset_calls, "-reset", out, arg);
mi_stat_counter_print(&stats->purge_calls, "-purge", out, arg);
mi_stat_counter_print(&stats->guarded_alloc_count, "guarded", out, arg);
mi_stat_print(&stats->threads, "threads", -1, out, arg);
mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
@ -399,36 +406,37 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
static mi_msecs_t mi_process_start; // = 0
static mi_stats_t* mi_stats_get_default(void) {
mi_heap_t* heap = mi_heap_get_default();
return &heap->tld->stats;
}
static void mi_stats_merge_from(mi_stats_t* stats) {
if (stats != &_mi_stats_main) {
mi_stats_add(&_mi_stats_main, stats);
memset(stats, 0, sizeof(mi_stats_t));
}
// return thread local stats
static mi_stats_t* mi_get_tld_stats(void) {
return &mi_heap_get_default()->tld->stats;
}
void mi_stats_reset(void) mi_attr_noexcept {
mi_stats_t* stats = mi_stats_get_default();
if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
mi_stats_t* stats = mi_get_tld_stats();
mi_subproc_t* subproc = _mi_subproc();
if (stats != &subproc->stats) { _mi_memzero(stats, sizeof(mi_stats_t)); }
_mi_memzero(&subproc->stats, sizeof(mi_stats_t));
if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
}
void mi_stats_merge(void) mi_attr_noexcept {
mi_stats_merge_from( mi_stats_get_default() );
void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from) {
if (to != from) {
mi_stats_add(to, from);
_mi_memzero(from, sizeof(mi_stats_t));
}
}
void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done`
mi_stats_merge_from(stats);
_mi_stats_merge_from(&_mi_subproc()->stats, stats);
}
void mi_stats_merge(void) mi_attr_noexcept {
_mi_stats_done( mi_get_tld_stats() );
}
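// Small usage sketch (editor illustration) with the public API, now that stats
// merge into the per-subprocess statistics instead of the former _mi_stats_main:
//
//   mi_stats_reset();       // zeroes both the thread-local and subprocess stats
//   /* ... allocate and free ... */
//   mi_stats_print(NULL);   // merges this thread's stats, then prints the totals
//
// mi_thread_stats_print_out() still prints only the calling thread's
// (unmerged) statistics.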
void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_stats_merge_from(mi_stats_get_default());
_mi_stats_print(&_mi_stats_main, out, arg);
mi_stats_merge();
_mi_stats_print(&_mi_subproc()->stats, out, arg);
}
void mi_stats_print(void* out) mi_attr_noexcept {
@ -437,7 +445,7 @@ void mi_stats_print(void* out) mi_attr_noexcept {
}
void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
_mi_stats_print(mi_stats_get_default(), out, arg);
_mi_stats_print(mi_get_tld_stats(), out, arg);
}
@ -471,11 +479,12 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) {
mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
{
mi_subproc_t* subproc = _mi_subproc();
mi_process_info_t pinfo;
_mi_memzero_var(pinfo);
pinfo.elapsed = _mi_clock_end(mi_process_start);
pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.current)));
pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.peak)));
pinfo.current_rss = pinfo.current_commit;
pinfo.peak_rss = pinfo.peak_commit;
pinfo.utime = 0;
@ -483,7 +492,7 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s
pinfo.page_faults = 0;
_mi_prim_process_info(&pinfo);
if (elapsed_msecs!=NULL) *elapsed_msecs = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX));
if (user_msecs!=NULL) *user_msecs = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX));
if (system_msecs!=NULL) *system_msecs = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.stime : PTRDIFF_MAX));


@ -50,7 +50,6 @@ int main() {
// mi_bins();
void* p1 = malloc(78);
void* p2 = malloc(24);
free(p1);
@ -83,7 +82,7 @@ int main() {
static void invalid_free() {
free((void*)0xBADBEEF);
realloc((void*)0xBADBEEF,10);
realloc((void*)0xBADBEEF, 10);
}
static void block_overflow1() {
@ -181,7 +180,7 @@ static void test_process_info(void) {
size_t peak_commit = 0;
size_t page_faults = 0;
for (int i = 0; i < 100000; i++) {
void* p = calloc(100,10);
void* p = calloc(100, 10);
free(p);
}
mi_process_info(&elapsed, &user_msecs, &system_msecs, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
@ -239,8 +238,8 @@ static void test_heap_walk(void) {
}
static void test_canary_leak(void) {
char* p = mi_mallocn_tp(char,23);
for(int i = 0; i < 23; i++) {
char* p = mi_mallocn_tp(char, 22);
for (int i = 0; i < 22; i++) {
p[i] = '0'+i;
}
puts(p);
@ -286,15 +285,15 @@ static void test_manage_os_memory(void) {
static void test_large_pages(void) {
mi_memid_t memid;
#if 0
#if 0
size_t pages_reserved;
size_t page_size;
uint8_t* p = (uint8_t*)_mi_os_alloc_huge_os_pages(1, -1, 30000, &pages_reserved, &page_size, &memid);
const size_t req_size = pages_reserved * page_size;
#else
#else
const size_t req_size = 64*MI_MiB;
uint8_t* p = (uint8_t*)_mi_os_alloc(req_size,&memid,NULL);
#endif
uint8_t* p = (uint8_t*)_mi_os_alloc(req_size, &memid, NULL);
#endif
p[0] = 1;
@ -317,8 +316,8 @@ static void test_large_pages(void) {
#if 0
#include <stdint.h>
#include <stdbool.h>
#include <mimalloc/bits.h>
#define MI_INTPTR_SIZE 8
#define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE)
#define MI_BIN_HUGE 100
@ -370,8 +369,6 @@ uint8_t _mi_bsr(uintptr_t x) {
#endif
}
static inline size_t _mi_wsize_from_size(size_t size) {
return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
}
@ -408,7 +405,9 @@ extern inline uint8_t _mi_bin8(size_t size) {
#endif
wsize--;
// find the highest bit
uint8_t b = mi_bsr32((uint32_t)wsize);
size_t idx;
mi_bsr(wsize, &idx);
uint8_t b = (uint8_t)idx;
// and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
// - adjust with 3 because we do not round the first 8 sizes
// which each get an exact bin
@ -440,7 +439,9 @@ static inline uint8_t _mi_bin4(size_t size) {
bin = MI_BIN_HUGE;
}
else {
uint8_t b = mi_bsr32((uint32_t)wsize);
size_t idx;
mi_bsr(wsize, &idx);
uint8_t b = (uint8_t)idx;
bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3;
}
return bin;
@ -456,7 +457,9 @@ static size_t _mi_binx4(size_t wsize) {
bin = (uint8_t)wsize;
}
else {
uint8_t b = mi_bsr32((uint32_t)wsize);
size_t idx;
mi_bsr(wsize, &idx);
uint8_t b = (uint8_t)idx;
if (b <= 1) return wsize;
bin = ((b << 1) | (wsize >> (b - 1))&0x01) + 3;
}
@ -465,7 +468,9 @@ static size_t _mi_binx4(size_t wsize) {
static size_t _mi_binx8(size_t bsize) {
if (bsize<=1) return bsize;
uint8_t b = mi_bsr32((uint32_t)bsize);
size_t idx;
mi_bsr(bsize, &idx);
uint8_t b = (uint8_t)idx;
if (b <= 2) return bsize;
size_t bin = ((b << 2) | (bsize >> (b - 2))&0x03) - 5;
return bin;
@ -483,8 +488,10 @@ static inline size_t mi_bin(size_t wsize) {
}
else {
wsize--;
assert(wsize>0);
// find the highest bit
uint8_t b = (uint8_t)mi_bsr32((uint32_t)wsize); // note: wsize != 0
uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize));
// and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
// - adjust with 3 because we do not round the first 8 sizes
// which each get an exact bin
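// A self-contained sketch (editor illustration, not mimalloc's exact code) of
// the "top 3 bits" binning idea described above: the highest set bit plus the
// next two bits select the bin, so bins between two powers of two are spaced a
// quarter apart and the worst-case internal slack is roughly an eighth of the
// block size (~12.5%). The `_sketch` names, the -3 offset, and the cut-off for
// exact small bins are illustrative only.
#include <stddef.h>
#include <stdio.h>

static unsigned bsr_sketch(size_t x) {   // index of the highest set bit, x > 0
  unsigned b = 0;
  while (x >>= 1) b++;
  return b;
}

static unsigned bin_of_wsize_sketch(size_t wsize) {
  if (wsize <= 8) return (unsigned)wsize;   // exact bins for the first sizes
  wsize--;                                  // so powers of two stay in the lower bin
  const unsigned b = bsr_sketch(wsize);     // highest bit
  return ((b << 2) | (unsigned)((wsize >> (b - 2)) & 0x03)) - 3;
}

int main(void) {
  // 97..112 words map to one bin; 113 starts the next (16-word spacing at b = 6).
  for (size_t w = 96; w <= 114; w++) {
    printf("wsize %3zu -> bin %u\n", w, bin_of_wsize_sketch(w));
  }
  return 0;
}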


@ -388,7 +388,7 @@ static void test_mt_shutdown()
// issue #372
static void fail_aslr() {
size_t sz = (4ULL << 40); // 4TiB
uint64_t sz = (4ULL << 40); // 4TiB
void* p = malloc(sz);
printf("pointer p: %p: area up to %p\n", p, (uint8_t*)p + sz);
*(int*)0x5FFFFFFF000 = 0; // should segfault


@ -34,7 +34,7 @@ we therefore test the API over various inputs. Please add more tests :-)
#include "mimalloc.h"
// #include "mimalloc/internal.h"
#include "mimalloc/types.h" // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX
#include "mimalloc/types.h" // for MI_DEBUG and MI_PAGE_MAX_OVERALLOC_ALIGN
#include "testhelper.h"
@ -169,7 +169,7 @@ int main(void) {
/*
CHECK_BODY("malloc-aligned6") {
bool ok = true;
for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) {
for (size_t align = 1; align <= MI_PAGE_MAX_OVERALLOC_ALIGN && ok; align *= 2) {
void* ps[8];
for (int i = 0; i < 8 && ok; i++) {
ps[i] = mi_malloc_aligned(align*13 // size
@ -186,16 +186,16 @@ int main(void) {
};
*/
CHECK_BODY("malloc-aligned7") {
void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX);
void* p = mi_malloc_aligned(1024,MI_PAGE_MAX_OVERALLOC_ALIGN);
mi_free(p);
result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0;
result = ((uintptr_t)p % MI_PAGE_MAX_OVERALLOC_ALIGN) == 0;
};
CHECK_BODY("malloc-aligned8") {
bool ok = true;
for (int i = 0; i < 5 && ok; i++) {
int n = (1 << i);
void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX);
ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0;
void* p = mi_malloc_aligned(1024, n * MI_PAGE_MAX_OVERALLOC_ALIGN);
ok = ((uintptr_t)p % (n*MI_PAGE_MAX_OVERALLOC_ALIGN)) == 0;
mi_free(p);
}
result = ok;
@ -203,7 +203,7 @@ int main(void) {
CHECK_BODY("malloc-aligned9") { // test large alignments
bool ok = true;
void* p[8];
size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 };
size_t sizes[8] = { 8, 512, 1024 * 1024, MI_PAGE_MAX_OVERALLOC_ALIGN, MI_PAGE_MAX_OVERALLOC_ALIGN + 1, 2 * MI_PAGE_MAX_OVERALLOC_ALIGN, 8 * MI_PAGE_MAX_OVERALLOC_ALIGN, 0 };
for (int i = 0; i < 28 && ok; i++) {
int align = (1 << i);
for (int j = 0; j < 8 && ok; j++) {


@ -40,6 +40,19 @@ static int ITER = 20;
static int THREADS = 8;
static int SCALE = 10;
static int ITER = 10;
#elif 0
static int THREADS = 4;
static int SCALE = 10;
static int ITER = 20;
#elif 0
static int THREADS = 32;
static int SCALE = 50;
static int ITER = 50;
#elif 0
static int THREADS = 32;
static int SCALE = 25;
static int ITER = 50;
#define ALLOW_LARGE true
#else
static int THREADS = 32; // more repeatable if THREADS <= #processors
static int SCALE = 50; // scaling factor
@ -50,7 +63,12 @@ static int ITER = 50; // N full iterations destructing and re-creating a
#define STRESS // undefine for leak test
static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100)
#ifndef ALLOW_LARGE
#define ALLOW_LARGE false
#endif
static bool allow_large_objects = ALLOW_LARGE; // allow very large objects? (set to `true` if SCALE>100)
static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`?
static bool main_participates = false; // main thread participates as a worker too
@ -66,7 +84,7 @@ static bool main_participates = false; // main thread participates as a
#define custom_free(p) mi_free(p)
#ifndef NDEBUG
#define HEAP_WALK // walk the heap objects?
#define xHEAP_WALK // walk the heap objects?
#endif
#endif
@ -241,9 +259,21 @@ static void test_stress(void) {
//mi_debug_show_arenas(true);
#endif
#if !defined(NDEBUG) || defined(MI_TSAN)
if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); }
if ((n + 1) % 10 == 0) {
printf("- iterations left: %3d\n", ITER - (n + 1));
mi_debug_show_arenas(true);
//mi_collect(true);
//mi_debug_show_arenas(true);
}
#endif
}
// clean up
for (int i = 0; i < TRANSFERS; i++) {
void* p = atomic_exchange_ptr(&transfer[i], NULL);
if (p != NULL) {
free_items(p);
}
}
}
#ifndef STRESS
@ -274,6 +304,10 @@ int main(int argc, char** argv) {
#endif
#if !defined(NDEBUG) && !defined(USE_STD_MALLOC)
mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */);
//mi_option_set(mi_option_purge_delay,10);
#endif
#if defined(NDEBUG) && !defined(USE_STD_MALLOC)
// mi_option_set(mi_option_purge_delay,-1);
#endif
#ifndef USE_STD_MALLOC
mi_stats_reset();
@ -318,7 +352,7 @@ int main(int argc, char** argv) {
#ifndef NDEBUG
mi_debug_show_arenas(true);
mi_collect(true);
#endif
#endif
#endif
mi_stats_print(NULL);
//bench_end_program();
@ -341,9 +375,10 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) {
thread_entry_fun = fun;
DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD));
HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE));
thandles[0] = GetCurrentThread(); // avoid lint warning
const size_t start = (main_participates ? 1 : 0);
for (size_t i = start; i < nthreads; i++) {
thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]);
thandles[i] = CreateThread(0, 8*1024L, &thread_entry, (void*)(i), 0, &tids[i]);
}
if (main_participates) fun(0); // run the main thread as well
for (size_t i = start; i < nthreads; i++) {