Merge branch 'dev3-bin-dbg' of https://github.com/microsoft/mimalloc into dev3-bin-dbg

Gustavo Varo 2025-03-05 16:27:42 -05:00
commit 71549dae90
51 changed files with 7111 additions and 4983 deletions


@ -10,31 +10,38 @@ option(MI_PADDING "Enable padding to detect heap block overflow (alway
option(MI_OVERRIDE "Override the standard malloc interface (i.e. define entry points for 'malloc', 'free', etc)" ON)
option(MI_XMALLOC "Enable abort() call on memory allocation failure by default" OFF)
option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF)
option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF)
option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF)
option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF)
option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF)
option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF)
option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for arm64: '-march=armv8.1-a' (2016))" ON)
option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON)
option(MI_OPT_SIMD "Use SIMD instructions (requires MI_OPT_ARCH to be enabled)" OFF)
option(MI_SEE_ASM "Generate assembly files" OFF)
option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON)
option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON)
option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON)
option(MI_WIN_USE_FIXED_TLS "Use a fixed TLS slot on Windows to avoid extra tests in the malloc fast path" OFF)
option(MI_LOCAL_DYNAMIC_TLS "Use local-dynamic-tls, a slightly slower but dlopen-compatible thread local storage mechanism (Unix)" OFF)
option(MI_LIBC_MUSL "Set this when linking with musl libc" OFF)
option(MI_LIBC_MUSL "Enable this when linking with musl libc" OFF)
option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF)
option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF)
option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF)
option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF)
option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF)
option(MI_BUILD_SHARED "Build shared library" ON)
option(MI_BUILD_STATIC "Build static library" ON)
option(MI_BUILD_OBJECT "Build object library" ON)
option(MI_BUILD_TESTS "Build test executables" ON)
option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF)
option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF)
option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF)
option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF)
option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." OFF)
option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF)
option(MI_NO_THP "Disable transparent huge pages support on Linux/Android for the mimalloc process only" OFF)
option(MI_EXTRA_CPPDEFS "Extra pre-processor definitions (use as `-DMI_EXTRA_CPPDEFS=\"opt1=val1;opt2=val2\"`)" "")
option(MI_WIN_DBG_EXTS "Build with Windows debugger extension points" OFF)
# negated options for vcpkg features
option(MI_NO_USE_CXX "Use plain C compilation (has priority over MI_USE_CXX)" OFF)
option(MI_NO_OPT_ARCH "Do not use architecture specific optimizations (like '-march=armv8.1-a' for example) (has priority over MI_OPT_ARCH)" OFF)
@ -54,6 +61,7 @@ set(mi_sources
src/alloc-aligned.c
src/alloc-posix.c
src/arena.c
src/arena-meta.c
src/bitmap.c
src/heap.c
src/init.c
@ -61,12 +69,15 @@ set(mi_sources
src/options.c
src/os.c
src/page.c
src/page-map.c
src/random.c
src/segment.c
src/segment-map.c
src/stats.c
src/prim/prim.c)
if(WIN32 AND MI_WIN_DBG_EXTS)
list(APPEND mi_sources src/prim/windows/windbg/mimalloc_dbg.cpp)
endif()
set(mi_cflags "")
set(mi_cflags_static "") # extra flags for a static library build
set(mi_cflags_dynamic "") # extra flags for a shared-object library build
@ -155,8 +166,8 @@ if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo")
if (NOT MI_OPT_ARCH)
message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)")
endif()
else()
set(MI_OPT_ARCH OFF)
#else()
# set(MI_OPT_ARCH OFF)
endif()
if(MI_OVERRIDE)
@ -248,6 +259,17 @@ if(MI_TRACK_ETW)
endif()
endif()
if(MI_WIN_DBG_EXTS)
if(NOT WIN32)
set(MI_WIN_DBG_EXTS OFF)
message(WARNING "Can only enable Windows debugger extension support on Windows (MI_WIN_DBG_EXTS=OFF)")
endif()
if(MI_WIN_DBG_EXTS)
message(STATUS "Compile with Windows debugger extension support (MI_WIN_DBG_EXTS=ON)")
list(APPEND mi_defines MI_WIN_DBG_EXTS=1)
endif()
endif()
if(MI_GUARDED)
message(STATUS "Compile guard pages behind certain object allocations (MI_GUARDED=ON)")
list(APPEND mi_defines MI_GUARDED=1)
@ -260,7 +282,7 @@ endif()
if(MI_SEE_ASM)
message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)")
list(APPEND mi_cflags -save-temps)
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang")
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 14)
message(STATUS "No GNU Line marker")
list(APPEND mi_cflags -Wno-gnu-line-marker)
endif()
@ -431,11 +453,15 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM
list(APPEND mi_cflags -ftls-model=initial-exec)
endif()
endif()
endif()
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel")
if(MI_OVERRIDE)
list(APPEND mi_cflags -fno-builtin-malloc)
endif()
endif()
# Compiler and architecture specific flags
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku")
if(MI_OPT_ARCH)
if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999)
@ -443,17 +469,24 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM
if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a")
endif()
if("x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES)
list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_x86_64;-march=haswell;-Xarch_x86_64;-mavx2")
endif()
elseif(MI_ARCH STREQUAL "x64")
set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2") # fast bit scan (since 2013)
elseif(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics
set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics (since 2016)
endif()
endif()
endif()
if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914)
if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) # vs2017+
list(APPEND mi_cflags /Zc:__cplusplus)
if(MI_OPT_ARCH AND NOT MI_CLANG_CL)
if(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH_FLAGS "/arch:armv8.1") # fast atomics
if(MI_ARCH STREQUAL "x64")
set(MI_OPT_ARCH_FLAGS "/arch:AVX2")
elseif(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH_FLAGS "/arch:armv8.1")
endif()
endif()
endif()
@ -465,6 +498,12 @@ endif()
if(MI_OPT_ARCH_FLAGS)
list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS})
message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)")
if (MI_OPT_SIMD)
list(APPEND mi_defines "MI_OPT_SIMD=1")
message(STATUS "SIMD instructions are enabled (MI_OPT_SIMD=ON)")
endif()
elseif(MI_OPT_SIMD)
message(STATUS "SIMD instructions are not enabled (either MI_OPT_ARCH=OFF or this architecture has no SIMD support)")
endif()
# extra needed libraries
@ -490,6 +529,9 @@ endfunction()
if(WIN32)
list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt)
if(MI_WIN_DBG_EXTS)
list(APPEND mi_libraries dbgeng) # todo: only for the dll?
endif()
else()
find_link_library("pthread" MI_LIB_PTHREAD)
if(MI_LIB_PTHREAD)


@ -30,6 +30,10 @@ jobs:
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
MSBuildConfiguration: Release
Release SIMD:
BuildType: release-simd
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON -DMI_WIN_USE_FIXED_TLS=ON
MSBuildConfiguration: Release
Secure:
BuildType: secure
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
@ -89,6 +93,11 @@ jobs:
CXX: clang++
BuildType: release-clang
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
Release SIMD Clang:
CC: clang
CXX: clang++
BuildType: release-simd-clang
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON
Secure Clang:
CC: clang
CXX: clang++
@ -148,6 +157,9 @@ jobs:
Release:
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
Release SIMD:
BuildType: release-simd
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON
Secure:
BuildType: secure
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
@ -263,3 +275,28 @@ jobs:
- script: ctest --verbose --timeout 240
workingDirectory: $(BuildType)
displayName: CTest
- job:
displayName: macOS 13 (Ventura)
pool:
vmImage:
macOS-13
strategy:
matrix:
Debug:
BuildType: debug
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
Release:
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
steps:
- task: CMake@1
inputs:
workingDirectory: $(BuildType)
cmakeArgs: .. $(cmakeExtraArgs)
- script: make -j$(sysctl -n hw.ncpu) -C $(BuildType)
displayName: Make
- script: ctest --verbose --timeout 180
workingDirectory: $(BuildType)
displayName: CTest


@ -63,7 +63,7 @@ need a specific redirection DLL:
mode on Windows arm64. Unfortunately we cannot run x64 code emulated on Windows arm64 with
the x64 mimalloc override directly (since the C runtime always uses `arm64ec`). Instead:
1. Build the program as normal for x64 and link as normal with the x64
`mimalloc.lib` export library.
`mimalloc.dll.lib` export library.
2. Now separately build `mimalloc.dll` in `arm64ec` mode and _overwrite_ your
previous (x64) `mimalloc.dll` -- the loader can handle the mix of arm64ec
and x64 code. Now use `mimalloc-redirect-arm64ec.dll` to match your new


@ -1,5 +1,5 @@
set(mi_version_major 1)
set(mi_version_minor 9)
set(mi_version_major 3)
set(mi_version_minor 0)
set(mi_version_patch 2)
set(mi_version ${mi_version_major}.${mi_version_minor})


@ -5,11 +5,11 @@ vcpkg_from_github(
# The "REF" can be a commit hash, branch name (dev2), or a version (v2.2.1).
# REF "v${VERSION}"
REF 866ce5b89db1dbc3e66bbf89041291fd16329518
REF 6a89f8554eaab8d8d00e17b5b09f79e1d8dbf61b
# The sha512 is the hash of the tar.gz bundle.
# (To get the sha512, run `vcpkg install mimalloc[override] --overlay-ports=<dir of this file>` and copy the sha from the error message.)
SHA512 0b0e5ff823c49b9534b8c32800679806c5d7c29020af058da043c3e6e36ae3c32a1cdd5a21ece97dd60bc7dd4703967f683beac435dbb8514638a6cc55e5dea8
SHA512 32b87a3195efcc558b83a546348a8fb544fed335cdd6c9f8e7e9d0e8e64540fdcf1f4aa57fd0e783b78731518f4810292b832227d7e7665bf8426f1e6ce96f9d
)
vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS


@ -1,6 +1,6 @@
{
"name": "mimalloc",
"version": "1.9.2",
"version": "3.0.2",
"port-version": 2,
"description": "Compact general purpose allocator with excellent performance",
"homepage": "https://github.com/microsoft/mimalloc",


@ -431,12 +431,11 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large);
/// @param start Start of the memory area
/// @param size The size of the memory area.
/// @param is_committed Is the area already committed?
/// @param is_large Does it consist of large OS pages? Set this to \a true as well for memory
/// that should not be decommitted or protected (like rdma etc.)
/// @param is_pinned Can the memory not be decommitted or reset? (usually the case for large OS pages)
/// @param is_zero Does the area consist of zeros?
/// @param numa_node Possible associated numa node or `-1`.
/// @return \a true if successful, and \a false on error.
bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node);
bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node);
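/// \b Example (illustrative only, not part of the documented API above): hand a
/// pre-reserved, committed region to mimalloc. Here `region` and `region_size`
/// are hypothetical inputs obtained elsewhere (e.g. from `mmap` or `VirtualAlloc`).
/// \code
/// bool ok = mi_manage_os_memory(region, region_size,
///             /*is_committed*/ true, /*is_pinned*/ false,
///             /*is_zero*/ false, /*numa_node*/ -1);
/// // on success mimalloc may now allocate from [region, region+region_size)
/// \endcode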
/// Reserve \a pages of huge OS pages (1GiB) evenly divided over \a numa_nodes nodes,
but stops after at most `timeout_msecs` milliseconds.


@ -178,6 +178,7 @@
<CompileAs>CompileAsCpp</CompileAs>
<SupportJustMyCode>false</SupportJustMyCode>
<LanguageStandard>stdcpp20</LanguageStandard>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Lib>
<AdditionalLibraryDirectories>
@ -197,6 +198,7 @@
<CompileAs>CompileAsCpp</CompileAs>
<SupportJustMyCode>false</SupportJustMyCode>
<LanguageStandard>stdcpp20</LanguageStandard>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<PostBuildEvent>
<Command>
@ -209,8 +211,7 @@
<Lib>
<AdditionalLibraryDirectories>
</AdditionalLibraryDirectories>
<AdditionalDependencies>
</AdditionalDependencies>
<AdditionalDependencies>dbgeng.lib</AdditionalDependencies>
</Lib>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">
@ -224,6 +225,7 @@
<CompileAs>CompileAsCpp</CompileAs>
<SupportJustMyCode>false</SupportJustMyCode>
<LanguageStandard>stdcpp20</LanguageStandard>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<PostBuildEvent>
<Command>
@ -251,6 +253,7 @@
<CompileAs>CompileAsCpp</CompileAs>
<SupportJustMyCode>false</SupportJustMyCode>
<LanguageStandard>stdcpp20</LanguageStandard>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<PostBuildEvent>
<Command>
@ -283,6 +286,7 @@
<CompileAs>CompileAsCpp</CompileAs>
<IntrinsicFunctions>true</IntrinsicFunctions>
<LanguageStandard>stdcpp20</LanguageStandard>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -311,6 +315,8 @@
<CompileAs>CompileAsCpp</CompileAs>
<IntrinsicFunctions>true</IntrinsicFunctions>
<LanguageStandard>stdcpp20</LanguageStandard>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -347,6 +353,7 @@
<LanguageStandard>stdcpp20</LanguageStandard>
<EnableEnhancedInstructionSet>CPUExtensionRequirementsARMv81</EnableEnhancedInstructionSet>
<ExceptionHandling>Sync</ExceptionHandling>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -383,6 +390,7 @@
<LanguageStandard>stdcpp20</LanguageStandard>
<EnableEnhancedInstructionSet>CPUExtensionRequirementsARMv81</EnableEnhancedInstructionSet>
<ExceptionHandling>Sync</ExceptionHandling>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -424,16 +432,7 @@
</ClCompile>
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena-abandon.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64EC'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\arena-meta.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
@ -453,6 +452,7 @@
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\libc.c" />
<ClCompile Include="..\..\src\page-map.c" />
<ClCompile Include="..\..\src\prim\prim.c" />
<ClCompile Include="..\..\src\prim\windows\prim.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
@ -476,9 +476,8 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\page.c" />
<ClCompile Include="..\..\src\prim\windows\windbg\mimalloc_dbg.cpp" />
<ClCompile Include="..\..\src\random.c" />
<ClCompile Include="..\..\src\segment-map.c" />
<ClCompile Include="..\..\src\segment.c" />
<ClCompile Include="..\..\src\os.c" />
<ClCompile Include="..\..\src\stats.c" />
</ItemGroup>
@ -488,11 +487,13 @@
<ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\include\mimalloc-stats.h" />
<ClInclude Include="..\..\include\mimalloc\atomic.h" />
<ClInclude Include="..\..\include\mimalloc\bits.h" />
<ClInclude Include="..\..\include\mimalloc\internal.h" />
<ClInclude Include="..\..\include\mimalloc\prim.h" />
<ClInclude Include="..\..\include\mimalloc\track.h" />
<ClInclude Include="..\..\include\mimalloc\types.h" />
<ClInclude Include="..\..\src\bitmap.h" />
<ClInclude Include="..\..\src\prim\windows\windbg\mimalloc_dbg.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">


@ -52,16 +52,16 @@
<ClCompile Include="..\..\src\random.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\stats.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-abandon.c">
<ClCompile Include="..\..\src\page-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-meta.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\prim\windows\windbg\mimalloc_dbg.cpp">
<Filter>Sources</Filter>
</ClCompile>
</ItemGroup>
@ -93,6 +93,9 @@
<ClInclude Include="..\..\include\mimalloc\prim.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc\bits.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-stats.h">
<Filter>Headers</Filter>
</ClInclude>


@ -174,6 +174,7 @@
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<SupportJustMyCode>false</SupportJustMyCode>
<CompileAs>CompileAsCpp</CompileAs>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<AdditionalDependencies>$(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies)</AdditionalDependencies>
@ -204,9 +205,10 @@
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<SupportJustMyCode>false</SupportJustMyCode>
<CompileAs>CompileAsCpp</CompileAs>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<AdditionalDependencies>$(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>$(ProjectDir)\..\..\bin\mimalloc-redirect.lib;dbgeng.lib;%(AdditionalDependencies)</AdditionalDependencies>
<IgnoreSpecificDefaultLibraries>
</IgnoreSpecificDefaultLibraries>
<ModuleDefinitionFile>
@ -234,6 +236,7 @@
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<SupportJustMyCode>false</SupportJustMyCode>
<CompileAs>CompileAsCpp</CompileAs>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<AdditionalDependencies>$(ProjectDir)\..\..\bin\mimalloc-redirect-arm64.lib;%(AdditionalDependencies)</AdditionalDependencies>
@ -264,6 +267,7 @@
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<SupportJustMyCode>false</SupportJustMyCode>
<CompileAs>CompileAsCpp</CompileAs>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<AdditionalDependencies>$(ProjectDir)\..\..\bin\mimalloc-redirect-arm64ec.lib;%(AdditionalDependencies)</AdditionalDependencies>
@ -298,6 +302,7 @@
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<CompileAs>CompileAsCpp</CompileAs>
<BufferSecurityCheck>false</BufferSecurityCheck>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -332,6 +337,7 @@
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<CompileAs>CompileAsCpp</CompileAs>
<BufferSecurityCheck>false</BufferSecurityCheck>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -367,6 +373,7 @@
<CompileAs>CompileAsCpp</CompileAs>
<BufferSecurityCheck>false</BufferSecurityCheck>
<EnableEnhancedInstructionSet>CPUExtensionRequirementsARMv81</EnableEnhancedInstructionSet>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -402,6 +409,7 @@
<CompileAs>CompileAsCpp</CompileAs>
<BufferSecurityCheck>false</BufferSecurityCheck>
<EnableEnhancedInstructionSet>CPUExtensionRequirementsARMv81</EnableEnhancedInstructionSet>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
@ -423,17 +431,17 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h" />
<ClInclude Include="..\..\include\mimalloc-etw-gen.h" />
<ClInclude Include="..\..\include\mimalloc-etw.h" />
<ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\include\mimalloc-override.h" />
<ClInclude Include="..\..\include\mimalloc-stats.h" />
<ClInclude Include="..\..\include\mimalloc\atomic.h" />
<ClInclude Include="..\..\include\mimalloc\bits.h" />
<ClInclude Include="..\..\include\mimalloc\internal.h" />
<ClInclude Include="..\..\include\mimalloc\prim.h" />
<ClInclude Include="..\..\include\mimalloc\track.h" />
<ClInclude Include="..\..\include\mimalloc\types.h" />
<ClInclude Include="..\..\src\bitmap.h" />
<ClInclude Include="..\..\src\prim\windows\windbg\mimalloc_dbg.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\alloc-aligned.c">
@ -458,7 +466,10 @@
</ClCompile>
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena-abandon.c">
<ClCompile Include="..\..\src\arena-meta.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c" />
<ClCompile Include="..\..\src\free.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64EC'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
@ -468,11 +479,10 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c" />
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\libc.c" />
<ClCompile Include="..\..\src\page-map.c" />
<ClCompile Include="..\..\src\prim\prim.c" />
<ClCompile Include="..\..\src\prim\windows\prim.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
@ -497,9 +507,8 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\page.c" />
<ClCompile Include="..\..\src\prim\windows\windbg\mimalloc_dbg.cpp" />
<ClCompile Include="..\..\src\random.c" />
<ClCompile Include="..\..\src\segment-map.c" />
<ClCompile Include="..\..\src\segment.c" />
<ClCompile Include="..\..\src\stats.c" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />


@ -49,16 +49,19 @@
<ClCompile Include="..\..\src\random.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\segment-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\stats.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-abandon.c">
<ClCompile Include="..\..\src\page-map.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\free.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\arena-meta.c">
<Filter>Sources</Filter>
</ClCompile>
<ClCompile Include="..\..\src\prim\windows\windbg\mimalloc_dbg.cpp">
<Filter>Sources</Filter>
</ClCompile>
</ItemGroup>
@ -75,12 +78,6 @@
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-etw.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-etw-gen.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-new-delete.h">
<Filter>Headers</Filter>
</ClInclude>
@ -96,6 +93,9 @@
<ClInclude Include="..\..\include\mimalloc\prim.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc\bits.h">
<Filter>Headers</Filter>
</ClInclude>
<ClInclude Include="..\..\include\mimalloc-stats.h">
<Filter>Headers</Filter>
</ClInclude>


@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
#define MI_MALLOC_VERSION 192 // major + 2 digits minor
#define MI_MALLOC_VERSION 302 // major + 2 digits minor
// ------------------------------------------------------
// Compiler specific attributes
@ -274,17 +274,17 @@ mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t nu
mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept;
mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept;
mi_decl_export void mi_arenas_print(void) mi_attr_noexcept;
// Experimental: heaps associated with specific memory arena's
typedef int mi_arena_id_t;
typedef void* mi_arena_id_t;
mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size);
mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
#if MI_MALLOC_VERSION >= 182
// Create a heap that only allocates in the specified arena
@ -323,6 +323,23 @@ mi_decl_export void mi_collect_reduce(size_t target_thread_owned) mi_attr_noexce
// experimental
//mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size);
//mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size);
//mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size);
//mi_decl_export void mi_os_free(void* p, size_t size);
//mi_decl_export void mi_os_commit(void* p, size_t size);
//mi_decl_export void mi_os_decommit(void* p, size_t size);
mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size);
mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id);
mi_decl_export bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena);
mi_decl_export void mi_heap_unload(mi_heap_t* heap);
// Is a pointer contained in the given arena area?
mi_decl_export bool mi_arena_contains(mi_arena_id_t arena_id, const void* p);
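// A hedged sketch of how the experimental unload/reload pair above could be used
// (assumptions: an exclusive arena id obtained from e.g. mi_reserve_os_memory_ex,
// and a heap bound to that arena; exact semantics may differ from this illustration):
//
// typedef struct { void* base; size_t accessed; size_t size; } saved_arena_t;
//
// static bool arena_suspend(mi_heap_t* heap, mi_arena_id_t arena, saved_arena_t* out) {
//   mi_heap_unload(heap);  // detach the heap state first
//   return mi_arena_unload(arena, &out->base, &out->accessed, &out->size);
// }
//
// static bool arena_resume(mi_heap_t* heap, const saved_arena_t* s) {
//   mi_arena_id_t arena;
//   if (!mi_arena_reload(s->base, s->size, &arena)) return false;
//   return mi_heap_reload(heap, arena);  // re-attach the heap to the reloaded arena
// }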
// ------------------------------------------------------
// Convenience
// ------------------------------------------------------
@ -370,12 +387,11 @@ typedef enum mi_option_e {
mi_option_os_tag, // tag used for OS logging (macOS only for now) (=100)
mi_option_max_errors, // issue at most N error messages
mi_option_max_warnings, // issue at most N warning messages
mi_option_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%)
mi_option_deprecated_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%)
mi_option_destroy_on_exit, // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe
mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`)
mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10)
mi_option_purge_extend_delay,
mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1)
mi_option_deprecated_purge_extend_delay,
mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's)
mi_option_retry_on_oom, // retry on out-of-memory for N milliseconds (=400), set to 0 to disable retries. (only on Windows)
mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0)
@ -384,8 +400,14 @@ typedef enum mi_option_e {
mi_option_guarded_precise, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0)
mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000)
mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0)
mi_option_target_segments_per_thread, // experimental (=0)
mi_option_generic_collect, // collect heaps every N (=10000) generic allocation calls
mi_option_page_reclaim_on_free, // reclaim abandoned pages on a free (=0). -1 disallows always, 0 allows if the page originated from the current heap, 1 allows always
mi_option_page_full_retain, // retain N full (small) pages per size class (=2)
mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4)
mi_option_max_vabits, // max user space virtual address bits to consider (=48)
mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0)
mi_option_page_commit_on_demand, // commit page memory on-demand
mi_option_page_reclaim_max, // don't reclaim pages if we already own N pages (in that size class) (=16)
_mi_option_last,
// legacy option names
mi_option_large_os_pages = mi_option_allow_large_os_pages,
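// Illustrative only: the new options above can be tuned programmatically with the
// existing mi_option_set API (or via their MIMALLOC_... environment variables),
// for example:
//   mi_option_set(mi_option_page_full_retain, 4);      // retain 4 full small pages per size class
//   mi_option_set(mi_option_page_reclaim_on_free, 1);  // always allow reclaiming abandoned pages
//   mi_option_set(mi_option_generic_collect, 20000);   // collect every 20000 generic allocation calls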


@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_ATOMIC_H
#define MIMALLOC_ATOMIC_H
#ifndef MI_ATOMIC_H
#define MI_ATOMIC_H
// include windows.h or pthreads.h
#if defined(_WIN32)
@ -75,16 +75,21 @@ terms of the MIT license. A copy of the license can be found in the file
#define mi_atomic_exchange_relaxed(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
#define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
#define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_and_relaxed(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_or_relaxed(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1)
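// Illustrative use of the CAS macros above (an example, not part of the original
// header): set a bit with a weak-CAS retry loop. `mi_atomic_load_relaxed` is
// defined earlier in this file; mi_atomic_or_acq_rel would do this in one step,
// the loop just demonstrates the expected/desired CAS pattern.
static inline void mi_example_atomic_set_bit(_Atomic(size_t)* field, size_t bit) {
  size_t expect = mi_atomic_load_relaxed(field);
  while (!mi_atomic_cas_weak_acq_rel(field, &expect, expect | bit)) {
    // on failure `expect` is refreshed with the observed value; retry
  }
}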
@ -406,10 +411,9 @@ static inline void mi_atomic_yield(void) {
// ----------------------------------------------------------------------
// Locks
// These do not have to be recursive and should be light-weight
// in-process only locks. Only used for reserving arena's and to
// maintain the abandoned list.
// Locks
// These should be light-weight in-process only locks.
// Only used for reserving arena's and to maintain the abandoned list.
// ----------------------------------------------------------------------
#if _MSC_VER
#pragma warning(disable:26110) // unlock with holding lock
@ -535,4 +539,4 @@ static inline void mi_lock_done(mi_lock_t* lock) {
#endif
#endif // __MIMALLOC_ATOMIC_H
#endif // MI_ATOMIC_H

include/mimalloc/bits.h (new file, 344 lines)

@ -0,0 +1,344 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
Bit operations, and platform dependent definitions (MI_INTPTR_SIZE etc.)
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITS_H
#define MI_BITS_H
// ------------------------------------------------------
// Size of a pointer.
// We assume that `sizeof(void*)==sizeof(intptr_t)`
// and it holds for all platforms we know of.
//
// However, the C standard only requires that:
// p == (void*)((intptr_t)p)
// but we also need:
// i == (intptr_t)((void*)i)
// or otherwise one might define an intptr_t type that is larger than a pointer...
// ------------------------------------------------------
#if INTPTR_MAX > INT64_MAX
# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example)
#elif INTPTR_MAX == INT64_MAX
# define MI_INTPTR_SHIFT (3)
#elif INTPTR_MAX == INT32_MAX
# define MI_INTPTR_SHIFT (2)
#else
#error platform pointers must be 32, 64, or 128 bits
#endif
#if (INTPTR_MAX) > LONG_MAX
# define MI_PU(x) x##ULL
#else
# define MI_PU(x) x##UL
#endif
#if SIZE_MAX == UINT64_MAX
# define MI_SIZE_SHIFT (3)
typedef int64_t mi_ssize_t;
#elif SIZE_MAX == UINT32_MAX
# define MI_SIZE_SHIFT (2)
typedef int32_t mi_ssize_t;
#else
#error platform objects must be 32 or 64 bits in size
#endif
#if (SIZE_MAX/2) > LONG_MAX
# define MI_ZU(x) x##ULL
#else
# define MI_ZU(x) x##UL
#endif
#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8)
#define MI_SIZE_SIZE (1<<MI_SIZE_SHIFT)
#define MI_SIZE_BITS (MI_SIZE_SIZE*8)
#define MI_KiB (MI_ZU(1024))
#define MI_MiB (MI_KiB*MI_KiB)
#define MI_GiB (MI_MiB*MI_KiB)
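// The pointer-size assumption above can be checked at compile time; a minimal
// illustrative check (C11):
_Static_assert(sizeof(void*) == sizeof(intptr_t), "void* and intptr_t must have the same size");
_Static_assert(MI_INTPTR_SIZE == sizeof(void*), "MI_INTPTR_SHIFT is mis-configured");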
/* --------------------------------------------------------------------------------
Architecture
-------------------------------------------------------------------------------- */
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) // consider arm64ec as arm64
#define MI_ARCH_ARM64 1
#elif defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
#define MI_ARCH_X64 1
#elif defined(__i386__) || defined(__i386) || defined(_M_IX86) || defined(_X86_) || defined(__X86__)
#define MI_ARCH_X86 1
#elif defined(__arm__) || defined(_ARM) || defined(_M_ARM) || defined(_M_ARMT) || defined(__arm)
#define MI_ARCH_ARM32 1
#elif defined(__riscv) || defined(_M_RISCV)
#define MI_ARCH_RISCV 1
#if (LONG_MAX == INT32_MAX)
#define MI_ARCH_RISCV32 1
#else
#define MI_ARCH_RISCV64 1
#endif
#endif
#if MI_ARCH_X64 && defined(__AVX2__)
#include <immintrin.h>
#elif MI_ARCH_ARM64 && MI_OPT_SIMD
#include <arm_neon.h>
#endif
#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
#include <intrin.h>
#endif
#if MI_ARCH_X64 && defined(__AVX2__) && !defined(__BMI2__) // msvc
#define __BMI2__ 1
#endif
#if MI_ARCH_X64 && (defined(__AVX2__) || defined(__BMI2__)) && !defined(__BMI1__) // msvc
#define __BMI1__ 1
#endif
// Define big endian if needed
// #define MI_BIG_ENDIAN 1
// maximum virtual address bits in a user-space pointer
#if MI_DEFAULT_VIRTUAL_ADDRESS_BITS > 0
#define MI_MAX_VABITS MI_DEFAULT_VIRTUAL_ADDRESS_BITS
#elif MI_ARCH_X64
#define MI_MAX_VABITS (47)
#elif MI_INTPTR_SIZE > 4
#define MI_MAX_VABITS (48)
#else
#define MI_MAX_VABITS (32)
#endif
// use a flat page-map (or a 2-level one)
#ifndef MI_PAGE_MAP_FLAT
#if MI_MAX_VABITS <= 40 && !defined(__APPLE__)
#define MI_PAGE_MAP_FLAT 1
#else
#define MI_PAGE_MAP_FLAT 0
#endif
#endif
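// A hedged sketch of the flat page-map idea (illustration only; the actual data
// structure lives in src/page-map.c and may differ): one entry per 64 KiB arena
// slice across the whole MI_MAX_VABITS address space, indexed directly by the
// pointer value. All `mi_example_` names below are hypothetical.
#define MI_EXAMPLE_SLICE_SHIFT  (16)  // 64 KiB slices
#define MI_EXAMPLE_MAP_ENTRIES  (MI_ZU(1) << (MI_MAX_VABITS - MI_EXAMPLE_SLICE_SHIFT))

static inline size_t mi_example_page_map_index(const void* p) {
  return ((uintptr_t)p >> MI_EXAMPLE_SLICE_SHIFT);  // index into MI_EXAMPLE_MAP_ENTRIES entries
}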
/* --------------------------------------------------------------------------------
Builtin's
-------------------------------------------------------------------------------- */
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#define mi_builtin(name) __builtin_##name
#define mi_has_builtin(name) __has_builtin(__builtin_##name)
#if (LONG_MAX == INT32_MAX)
#define mi_builtin32(name) mi_builtin(name##l)
#define mi_has_builtin32(name) mi_has_builtin(name##l)
#else
#define mi_builtin32(name) mi_builtin(name)
#define mi_has_builtin32(name) mi_has_builtin(name)
#endif
#if (LONG_MAX == INT64_MAX)
#define mi_builtin64(name) mi_builtin(name##l)
#define mi_has_builtin64(name) mi_has_builtin(name##l)
#else
#define mi_builtin64(name) mi_builtin(name##ll)
#define mi_has_builtin64(name) mi_has_builtin(name##ll)
#endif
#if (MI_SIZE_BITS == 32)
#define mi_builtinz(name) mi_builtin32(name)
#define mi_has_builtinz(name) mi_has_builtin32(name)
#define mi_msc_builtinz(name) name
#elif (MI_SIZE_BITS == 64)
#define mi_builtinz(name) mi_builtin64(name)
#define mi_has_builtinz(name) mi_has_builtin64(name)
#define mi_msc_builtinz(name) name##64
#endif
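// For example, on an LP64 platform (MI_SIZE_BITS==64, LONG_MAX==INT64_MAX)
// these dispatch macros expand as:
//   mi_builtinz(ctz)(x)               -> __builtin_ctzl(x)
//   mi_has_builtinz(ctz)              -> __has_builtin(__builtin_ctzl)
//   mi_msc_builtinz(_BitScanForward)  -> _BitScanForward64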
/* --------------------------------------------------------------------------------
Popcount and count trailing/leading zero's
-------------------------------------------------------------------------------- */
size_t _mi_popcount_generic(size_t x);
static inline size_t mi_popcount(size_t x) {
#if mi_has_builtinz(popcount)
return mi_builtinz(popcount)(x);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
return mi_msc_builtinz(__popcnt)(x);
#elif MI_ARCH_X64 && defined(__BMI1__)
return (size_t)_mm_popcnt_u64(x);
#else
#define MI_HAS_FAST_POPCOUNT 0
return (x<=1 ? x : _mi_popcount_generic(x));
#endif
}
#ifndef MI_HAS_FAST_POPCOUNT
#define MI_HAS_FAST_POPCOUNT 1
#endif
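// For reference, a sketch of what a portable fallback can look like (the real
// _mi_popcount_generic lives elsewhere in the sources and may differ):
static inline size_t mi_example_popcount_generic(size_t x) {
  size_t n = 0;
  while (x != 0) { x &= (x - 1); n++; }  // Kernighan: clears the lowest set bit each iteration
  return n;
}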
size_t _mi_clz_generic(size_t x);
size_t _mi_ctz_generic(size_t x);
static inline size_t mi_ctz(size_t x) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__)
size_t r;
__asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return r;
#elif defined(__GNUC__) && MI_ARCH_X64
// tzcnt is interpreted as bsf if BMI1 is not supported (pre-haswell)
// if the argument is zero:
// - tzcnt: sets carry-flag, and returns MI_SIZE_BITS
// - bsf : sets zero-flag, and leaves the destination _unmodified_ (on both AMD and Intel now, see <https://github.com/llvm/llvm-project/pull/102885>)
// so we always initialize r to MI_SIZE_BITS to work correctly on all cpu's without branching
size_t r = MI_SIZE_BITS;
__asm ("tzcnt\t%1, %0" : "+r"(r) : "r"(x) : "cc"); // use '+r' to keep the assignment to r in case this becomes bsf on older cpu's
return r;
#elif mi_has_builtinz(ctz)
return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS);
#elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__)
return (x!=0 ? _tzcnt_u64(x) : MI_SIZE_BITS); // ensure it still works on non-BMI1 cpu's as well
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long idx;
return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS);
#elif defined(__GNUC__) && MI_ARCH_X86
size_t r = MI_SIZE_BITS;
__asm ("bsf\t%1, %0" : "+r"(r) : "r"(x) : "cc");
return r;
#elif MI_HAS_FAST_POPCOUNT
return (x!=0 ? (mi_popcount(x^(x-1))-1) : MI_SIZE_BITS);
#else
#define MI_HAS_FAST_BITSCAN 0
return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS);
#endif
}
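// Similarly, a portable trailing-zero count as a reference sketch (the real
// _mi_ctz_generic may be implemented differently):
static inline size_t mi_example_ctz_generic(size_t x) {
  if (x == 0) return MI_SIZE_BITS;
  size_t n = 0;
  while ((x & 1) == 0) { x >>= 1; n++; }
  return n;
}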
static inline size_t mi_clz(size_t x) {
// we don't optimize anymore to lzcnt as there are still non BMI1 cpu's around (like Intel Celeron, see issue #1016)
// on pre-haswell cpu's lzcnt gets executed as bsr which is not equivalent (as it returns the bit position)
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0
size_t r;
__asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return r;
#elif mi_has_builtinz(clz)
return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long idx;
return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS);
#elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86)
if (x==0) return MI_SIZE_BITS;
size_t r;
__asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc");
return (MI_SIZE_BITS - 1 - r);
#else
#define MI_HAS_FAST_BITSCAN 0
return (x!=0 ? _mi_clz_generic(x) : MI_SIZE_BITS);
#endif
}
#ifndef MI_HAS_FAST_BITSCAN
#define MI_HAS_FAST_BITSCAN 1
#endif
/* --------------------------------------------------------------------------------
find trailing/leading zero (bit scan forward/reverse)
-------------------------------------------------------------------------------- */
// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's)
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bsf(size_t x, size_t* idx) {
// we don't optimize anymore to lzcnt so we run correctly on older cpu's as well
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9)
// on x64 the carry flag is set on zero which gives better codegen
bool is_zero;
__asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" );
return !is_zero;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long i;
return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false);
#else
return (x!=0 ? (*idx = mi_ctz(x), true) : false);
#endif
}
// Bit scan reverse: find the most significant bit that is set
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bsr(size_t x, size_t* idx) {
#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long i;
return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false);
#else
return (x!=0 ? (*idx = MI_SIZE_BITS - 1 - mi_clz(x), true) : false);
#endif
}
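// Illustrative use of mi_bsf (an example, not from the original header): visit
// every set bit in a word from least to most significant -- a common pattern
// when scanning allocation bitmaps.
static inline void mi_example_visit_bits(size_t mask, void (*visit)(size_t bit_idx)) {
  size_t idx;
  while (mi_bsf(mask, &idx)) {
    visit(idx);
    mask &= (mask - 1);  // clear the lowest set bit and continue
  }
}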
/* --------------------------------------------------------------------------------
rotate
-------------------------------------------------------------------------------- */
static inline size_t mi_rotr(size_t x, size_t r) {
#if (mi_has_builtin(rotateright64) && MI_SIZE_BITS==64)
return mi_builtin(rotateright64)(x,r);
#elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32)
return mi_builtin(rotateright32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
return _rotr64(x, (int)r);
#elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
return _lrotr(x,(int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1);
return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1))));
#endif
}
static inline size_t mi_rotl(size_t x, size_t r) {
#if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64)
return mi_builtin(rotateleft64)(x,r);
#elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32)
return mi_builtin(rotateleft32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
return _rotl64(x, (int)r);
#elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
return _lrotl(x, (int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1);
return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1))));
#endif
}
static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) {
#if mi_has_builtin(rotateleft32)
return mi_builtin(rotateleft32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
return _lrotl(x, (int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & 31;
return ((x << rshift) | (x >> ((-rshift) & 31)));
#endif
}
#endif // MI_BITS_H

File diff suppressed because it is too large.


@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_PRIM_H
#define MIMALLOC_PRIM_H
#ifndef MI_PRIM_H
#define MI_PRIM_H
// --------------------------------------------------------------------------
@ -117,7 +117,8 @@ void _mi_prim_thread_done_auto_done(void);
// Called when the default heap for a thread changes
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
// Is this thread part of a thread pool?
bool _mi_prim_thread_is_in_threadpool(void);
@ -269,35 +270,42 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
// defined in `init.c`; do not use these directly
extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
extern bool _mi_process_is_initialized; // has mi_process_init been called?
extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called?
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept;
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
const mi_threadid_t tid = __mi_prim_thread_id();
mi_assert_internal(tid > 1);
mi_assert_internal((tid & MI_PAGE_FLAG_MASK) == 0); // bottom 2 bits are clear?
return tid;
}
// Get a unique id for the current thread.
#if defined(MI_PRIM_THREAD_ID)
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488)
}
#elif defined(_WIN32)
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
// Windows: works on Intel and ARM in both 32- and 64-bit
return (uintptr_t)NtCurrentTeb();
}
#elif MI_USE_BUILTIN_THREAD_POINTER
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
// Works on most Unix based platforms with recent compilers
return (uintptr_t)__builtin_thread_pointer();
}
#elif MI_HAS_TLS_SLOT
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
#if defined(__BIONIC__)
// issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
// see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
@ -313,7 +321,7 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
#else
// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept {
return (uintptr_t)&_mi_heap_default;
}
@ -416,4 +424,4 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) {
#endif // mi_prim_get_default_heap()
#endif // MIMALLOC_PRIM_H
#endif // MI_PRIM_H


@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_TRACK_H
#define MIMALLOC_TRACK_H
#ifndef MI_TRACK_H
#define MI_TRACK_H
/* ------------------------------------------------------------------------------------------------------
Track memory ranges with macros for tools like Valgrind, address sanitizer, or other memory checkers.
@ -142,4 +142,4 @@ defined, undefined, or not accessible at all:
}
#endif
#endif
#endif // MI_TRACK_H


@ -5,17 +5,15 @@ terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_TYPES_H
#define MIMALLOC_TYPES_H
#ifndef MI_TYPES_H
#define MI_TYPES_H
// --------------------------------------------------------------------------
// This file contains the main type definitions for mimalloc:
// mi_heap_t : all data for a thread-local heap, contains
// lists of all managed heap pages.
// mi_segment_t : a larger chunk of memory (32GiB) from where pages
// are allocated.
// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from
// where objects are allocated.
// where objects of a single size are allocated.
// Note: we write "OS page" for OS memory pages while
// using plain "page" for mimalloc pages (`mi_page_t`).
// --------------------------------------------------------------------------
@ -24,11 +22,9 @@ terms of the MIT license. A copy of the license can be found in the file
#include <mimalloc-stats.h>
#include <stddef.h> // ptrdiff_t
#include <stdint.h> // uintptr_t, uint16_t, etc
#include "atomic.h" // _Atomic
#ifdef _MSC_VER
#pragma warning(disable:4214) // bitfield is not int
#endif
#include <errno.h> // error codes
#include "bits.h" // size defines (MI_INTPTR_SIZE etc), bit operations
#include "atomic.h" // _Atomic primitives
// Minimal alignment necessary. On most platforms 16 bytes are needed
// due to SSE registers for example. This must be at least `sizeof(void*)`
@ -51,11 +47,17 @@ terms of the MIT license. A copy of the license can be found in the file
// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
// #define MI_STAT 1
// Define MI_SECURE to enable security mitigations
// #define MI_SECURE 1 // guard page around metadata
// #define MI_SECURE 2 // guard page around each mimalloc page
// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
// #define MI_SECURE 4 // checks for double free. (may be more expensive)
// Define MI_SECURE to enable security mitigations. Level 1 has minimal performance impact,
// but protects most metadata with guard pages:
// #define MI_SECURE 1 // guard page around metadata
//
// Level 2 has more performance impact but protects well against various buffer overflows
// by surrounding all mimalloc pages with guard pages:
// #define MI_SECURE 2 // guard page around each mimalloc page (can fragment VMA's with large heaps..)
//
// The next two levels can have more performance cost:
// #define MI_SECURE 3 // randomize allocations, encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
// #define MI_SECURE 4 // checks for double free. (may be more expensive)
#if !defined(MI_SECURE)
#define MI_SECURE 0
@ -98,126 +100,132 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_ENCODE_FREELIST 1
#endif
// Enable large pages for objects between 64 KiB and 512 KiB.
// Disabled by default as for many workloads the block sizes above 64 KiB are quite random, which can lead to too many partially used large pages.
#ifndef MI_ENABLE_LARGE_PAGES
#define MI_ENABLE_LARGE_PAGES 0
#endif
// We used to abandon huge pages in order to eagerly deallocate them if freed from another thread.
// Unfortunately, that makes it impossible to visit them during a heap walk or include them in a
// `mi_heap_destroy`. We therefore nowadays reset/decommit huge blocks if freed from
// another thread so the memory becomes "virtually" available (and eventually gets properly freed by
// the owning thread).
// #define MI_HUGE_PAGE_ABANDON 1
// --------------------------------------------------------------
// Sizes of internal data-structures
// (comments specify sizes on 64-bit, usually 32-bit is halved)
// --------------------------------------------------------------
// ------------------------------------------------------
// Platform specific values
// ------------------------------------------------------
// ------------------------------------------------------
// Size of a pointer.
// We assume that `sizeof(void*)==sizeof(intptr_t)`
// and it holds for all platforms we know of.
//
// However, the C standard only requires that:
// p == (void*)((intptr_t)p))
// but we also need:
// i == (intptr_t)((void*)i)
// or otherwise one might define an intptr_t type that is larger than a pointer...
// ------------------------------------------------------
#if INTPTR_MAX > INT64_MAX
# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example)
#elif INTPTR_MAX == INT64_MAX
# define MI_INTPTR_SHIFT (3)
#elif INTPTR_MAX == INT32_MAX
# define MI_INTPTR_SHIFT (2)
#else
#error platform pointers must be 32, 64, or 128 bits
#endif
// Sizes are for 64-bit
#ifndef MI_ARENA_SLICE_SHIFT
#ifdef MI_SMALL_PAGE_SHIFT // backward compatibility
#define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT
#else
#define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit)
#endif
#endif
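// A quick self-contained check of the round-trip assumptions above (a sketch for
// illustration; mimalloc relies on these implicitly):
//
//   #include <stdint.h>
//   #include <assert.h>
//   int main(void) {
//     void* p = &p;
//     assert(p == (void*)((intptr_t)p));  // guaranteed by the C standard
//     intptr_t i = (intptr_t)p;
//     assert(i == (intptr_t)((void*)i));  // the extra assumption needed here
//     _Static_assert(sizeof(void*) == sizeof(intptr_t), "assumed by mimalloc");
//     return 0;
//   }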
#ifndef MI_BCHUNK_BITS_SHIFT
#define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512)
#endif
#if SIZE_MAX == UINT64_MAX
# define MI_SIZE_SHIFT (3)
typedef int64_t mi_ssize_t;
#elif SIZE_MAX == UINT32_MAX
# define MI_SIZE_SHIFT (2)
typedef int32_t mi_ssize_t;
#else
#error platform objects must be 32 or 64 bits
#endif
#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) // sub-bitmaps are "bchunks" of 512 bits
#define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) // arena's allocate in slices of 64 KiB
#define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE)
#if (SIZE_MAX/2) > LONG_MAX
# define MI_ZU(x) x##ULL
# define MI_ZI(x) x##LL
#else
# define MI_ZU(x) x##UL
# define MI_ZI(x) x##L
#endif
#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8)
#define MI_SIZE_SIZE (1<<MI_SIZE_SHIFT)
#define MI_SIZE_BITS (MI_SIZE_SIZE*8)
#define MI_ARENA_MIN_OBJ_SLICES (1)
#define MI_ARENA_MAX_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries)
#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE)
#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE)
#define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE // 64 KiB
#define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bchunk bitmap)
#define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bchunk bitmap)
#define MI_KiB (MI_ZU(1024))
#define MI_MiB (MI_KiB*MI_KiB)
#define MI_GiB (MI_MiB*MI_KiB)
// ------------------------------------------------------
// Main internal data-structures
// ------------------------------------------------------
// Main tuning parameters for segment and page sizes
// Sizes for 64-bit, divide by two for 32-bit
#ifndef MI_SMALL_PAGE_SHIFT
#define MI_SMALL_PAGE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB
#endif
#ifndef MI_MEDIUM_PAGE_SHIFT
#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB
#endif
#ifndef MI_LARGE_PAGE_SHIFT
#define MI_LARGE_PAGE_SHIFT ( 3 + MI_MEDIUM_PAGE_SHIFT) // 4MiB
#endif
#ifndef MI_SEGMENT_SHIFT
#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB -- must be equal to `MI_LARGE_PAGE_SHIFT`
#endif
// Derived constants
#define MI_SEGMENT_SIZE (MI_ZU(1)<<MI_SEGMENT_SHIFT)
#define MI_SEGMENT_ALIGN (MI_SEGMENT_SIZE)
#define MI_SEGMENT_MASK ((uintptr_t)(MI_SEGMENT_ALIGN - 1))
#define MI_SMALL_PAGE_SIZE (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
#define MI_MEDIUM_PAGE_SIZE (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
#define MI_LARGE_PAGE_SIZE (MI_ZU(1)<<MI_LARGE_PAGE_SHIFT)
#define MI_SMALL_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
#define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)
#define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE)
// The max object sizes are checked so that we don't waste more than 12.5% internally over the page sizes.
// (Except for large pages since huge objects are allocated in 4MiB chunks)
#define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/8) // 8 KiB
#define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/8) // 64 KiB
#define MI_LARGE_OBJ_SIZE_MAX (MI_LARGE_PAGE_SIZE/4) // 1 MiB
#define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
// Maximum number of size classes. (spaced exponentially in 12.5% increments)
#if MI_BIN_HUGE != 73U
#error "mimalloc internal: expecting 73 bins"
#endif
#if (MI_LARGE_OBJ_WSIZE_MAX >= 655360)
#error "mimalloc internal: define more bins"
#endif
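// To make the "12.5% increments" concrete, a simplified sketch of mapping a size (in
// machine words) to a bin -- not the exact `_mi_bin`, which handles the smallest sizes
// differently -- using the GCC/Clang `__builtin_clzll` builtin: each power-of-two range
// is split into 4 linearly spaced sub-bins, bounding worst-case internal waste to ~12.5%.
static inline size_t sketch_bin_of(size_t wsize) {
  if (wsize <= 8) return wsize;  // the first few sizes each get an exact bin
  wsize--;
  const size_t b = (size_t)(63 - __builtin_clzll((unsigned long long)wsize)); // highest set bit
  return ((b << 2) + ((wsize >> (b - 2)) & 0x03)) - 3;  // 4 sub-bins per power of two
}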
// Maximum block size for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`)
#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX)
// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments
#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1)
#define MI_BIN_FULL (MI_BIN_HUGE+1)
#define MI_BIN_COUNT (MI_BIN_FULL+1)
// We never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX
// Minimal commit for a page on-demand commit (should be >= OS page size)
#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE // (4*MI_KiB)
// ------------------------------------------------------
// Arena's are large reserved areas of memory allocated from
// the OS that are managed by mimalloc to efficiently
// allocate MI_ARENA_SLICE_SIZE slices of memory for the
// mimalloc pages.
// ------------------------------------------------------
// A large memory arena where pages are allocated in.
typedef struct mi_arena_s mi_arena_t; // defined in `arena.c`
// ---------------------------------------------------------------
// a memory id tracks the provenance of arena/OS allocated memory
// ---------------------------------------------------------------
// Memory can reside in an arena, be allocated directly from the OS, live in a meta-data page, or be statically allocated.
// The memid keeps track of this.
typedef enum mi_memkind_e {
MI_MEM_NONE, // not allocated
MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
MI_MEM_STATIC, // allocated in a static area and should not be freed (the initial main heap data for example (`init.c`))
MI_MEM_META, // allocated with the meta data allocator (`arena-meta.c`)
MI_MEM_OS, // allocated from the OS
MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
MI_MEM_OS_REMAP, // allocated in a remappable area (i.e. using `mremap`)
MI_MEM_ARENA // allocated from an arena (the usual case) (`arena.c`)
} mi_memkind_t;
static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
}
static inline bool mi_memkind_needs_no_free(mi_memkind_t memkind) {
return (memkind <= MI_MEM_STATIC);
}
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t size; // allocated full size
// size_t alignment; // alignment at allocation
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
mi_arena_t* arena; // arena that contains this memory
uint32_t slice_index; // slice index in the arena
uint32_t slice_count; // allocated slices
} mi_memid_arena_info_t;
typedef struct mi_memid_meta_info {
void* meta_page; // meta-page that contains the block
uint32_t block_index; // block index in the meta-data page
uint32_t block_count; // allocated blocks
} mi_memid_meta_info_t;
typedef struct mi_memid_s {
union {
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
mi_memid_meta_info_t meta; // only used for MI_MEM_META
} mem;
mi_memkind_t memkind;
bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2MiB) or huge (1GiB) OS pages)
bool initially_committed;// `true` if the memory was originally allocated as committed
bool initially_zero; // `true` if the memory was originally zero initialized
} mi_memid_t;
static inline bool mi_memid_is_os(mi_memid_t memid) {
return mi_memkind_is_os(memid.memkind);
}
static inline bool mi_memid_needs_no_free(mi_memid_t memid) {
return mi_memkind_needs_no_free(memid.memkind);
}
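// As an illustration of how provenance travels with a block, a hedged sketch of a
// free path dispatching on the memid (hypothetical helper, not the actual code):
static inline void sketch_memid_free(mi_memid_t memid) {
  if (mi_memid_needs_no_free(memid)) {
    // MI_MEM_NONE/MI_MEM_EXTERNAL/MI_MEM_STATIC: mimalloc must not free this memory
  }
  else if (mi_memid_is_os(memid)) {
    // OS memory: free the original base and full size (memid.mem.os.base/.size),
    // not just the aligned sub-range that was handed out
  }
  else if (memid.memkind == MI_MEM_ARENA) {
    // arena memory: release slices [slice_index, slice_index + slice_count)
    // back to the bitmaps of memid.mem.arena.arena
  }
  else {
    // MI_MEM_META: return blocks [block_index, block_index + block_count)
    // to the meta-page in memid.mem.meta.meta_page
  }
}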
// ------------------------------------------------------
// Mimalloc pages contain allocated blocks
@ -235,48 +243,30 @@ typedef struct mi_block_s {
mi_encoded_t next;
} mi_block_t;
#if MI_GUARDED
// we always align guarded pointers in a block at an offset;
// the block `next` field is then used as a tag to distinguish regular offset-aligned blocks from guarded ones
#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0))
#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED)
#endif
// The page flags are put in the bottom 2 bits of the thread_id (for a fast test in `mi_free`)
// `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing)
// `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing)
#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01)
#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02)
#define MI_PAGE_FLAG_MASK MI_ZU(0x03)
typedef size_t mi_page_flags_t;
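// For illustration: with the flags in the bottom bits of `xthread_id`, the free fast
// path can test "owned by this thread and no flags set" with a single comparison
// (a sketch of the idea used in `free.c:mi_free`; thread ids have their low bits
// clear so the flag bits never collide):
static inline bool sketch_is_fast_local_free(size_t xthread_id, size_t my_thread_id) {
  return (xthread_id == my_thread_id);  // false if abandoned, in the full queue, or has aligned blocks
}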
// The delayed flags are used for efficient multi-threaded free-ing
typedef enum mi_delayed_e {
MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list
MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap
MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
MI_NEVER_DELAYED_FREE = 3 // sticky: used for abandoned pages without an owning heap; this only resets on page reclaim
} mi_delayed_t;
// The `in_full` and `has_aligned` page flags are put in a union to efficiently
// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
#if !MI_TSAN
typedef union mi_page_flags_s {
uint8_t full_aligned;
struct {
uint8_t in_full : 1;
uint8_t has_aligned : 1;
} x;
} mi_page_flags_t;
#else
// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
typedef union mi_page_flags_s {
uint32_t full_aligned;
struct {
uint8_t in_full;
uint8_t has_aligned;
} x;
} mi_page_flags_t;
#endif
// There are two special threadid's: 0 for abandoned threads, and 4 for abandoned & mapped threads --
// abandoned-mapped pages are abandoned but also mapped in an arena so can be quickly found for reuse.
#define MI_THREADID_ABANDONED MI_ZU(0)
#define MI_THREADID_ABANDONED_MAPPED (MI_PAGE_FLAG_MASK + 1)
// Thread free list.
// Points to a list of blocks that are freed by other threads.
// The least significant bit is set if the page is owned by the current thread (`mi_page_is_owned`).
// Ownership is required before we can read any non-atomic fields in the page.
// This way we can push a block on the thread free list and try to claim ownership atomically in `free.c:mi_free_block_mt`.
typedef uintptr_t mi_thread_free_t;
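// A sketch of the encoding described above: the least significant bit is the ownership
// bit and the remaining bits hold the list head (the real helpers follow these lines):
static inline mi_block_t* sketch_tf_block(mi_thread_free_t tf) {
  return (mi_block_t*)(tf & ~((uintptr_t)1));       // strip the ownership bit
}
static inline bool sketch_tf_is_owned(mi_thread_free_t tf) {
  return ((tf & 1) != 0);
}
static inline mi_thread_free_t sketch_tf_create(mi_block_t* block, bool owned) {
  return ((mi_thread_free_t)block | (owned ? 1 : 0));
}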
// A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython)
typedef uint8_t mi_heaptag_t;
// A page contains blocks of one specific size (`block_size`).
// Each page has three lists of free blocks:
// `free` for blocks that can be allocated,
@ -285,169 +275,109 @@ typedef uintptr_t mi_thread_free_t;
// The `local_free` and `thread_free` lists are migrated to the `free` list
// when it is exhausted. The separate `local_free` list is necessary to
// implement a monotonic heartbeat. The `thread_free` list is needed for
// avoiding atomic operations in the common case.
// avoiding atomic operations when allocating from the owning thread.
//
// `used - |thread_free|` == actual blocks that are in use (alive)
// `used - |thread_free| + |free| + |local_free| == capacity`
//
// We don't count `freed` (as |free|) but use `used` to reduce
// We don't count "freed" (as |free|) but use only the `used` field to reduce
// the number of memory accesses in the `mi_page_all_free` function(s).
// Use `_mi_page_free_collect` to collect the thread_free list and update the `used` count.
//
// Notes:
// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
// - Non-atomic fields can only be accessed if having _ownership_ (low bit of `xthread_free` is 1).
// Combining the `thread_free` list with an ownership bit allows a concurrent `free` to atomically
// free an object and (re)claim ownership if the page was abandoned.
// - If a page is not part of a heap it is called "abandoned" (`heap==NULL`) -- in
// that case the `xthread_id` is 0 or 4 (4 is for abandoned pages that
// are in the abandoned page lists of an arena, these are called "mapped" abandoned pages).
// - page flags are in the bottom 3 bits of `xthread_id` for the fast path in `mi_free`.
// - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
// - Using `uint16_t` does not seem to slow things down
// - The size is 10 words on 64-bit which helps the page index calculations
// (and 12 words on 32-bit, and encoded free lists add 2 words)
// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize
// concurrent frees where only the first concurrent free adds to the owning
// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`).
// The invariant is that no-delayed-free is only set if there is
// at least one block that will be added, or has already been added, to
// the owning heap `thread_delayed_free` list. This guarantees that pages
// will be freed correctly even if only other threads free blocks.
typedef struct mi_page_s {
_Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= `heap->thread_id (or 0 or 4 if abandoned) | page_flags`)
mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
uint16_t used; // number of blocks in use (including blocks in `thread_free`)
uint16_t capacity; // number of blocks committed
uint16_t reserved; // number of blocks reserved in memory
uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
uint8_t retire_expire; // expiration count for retired blocks
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads (= `mi_block_t* | (1 if owned)`)
size_t block_size; // size available in each block (always `>0`)
uint8_t* page_start; // start of the blocks
mi_heaptag_t heap_tag; // tag of the owning heap, used to separate heaps by object type
bool free_is_zero; // `true` if the blocks in the free list are zero initialized
// padding
#if (MI_ENCODE_FREELIST || MI_PADDING)
uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
#endif
mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages)
struct mi_page_s* next; // next page owned by the heap with the same `block_size`
struct mi_page_s* prev; // previous page owned by the heap with the same `block_size`
size_t slice_committed; // committed size relative to the first arena slice of the page data (or 0 if the page is fully committed already)
mi_memid_t memid; // provenance of the page memory
} mi_page_t;
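// To make the `used`/`thread_free` accounting concrete, a simplified model of what
// `_mi_page_free_collect` does, over plain lists (the real code atomically swaps out
// the encoded `xthread_free` list while keeping the ownership bit set):
//
//   while (thread_free != NULL) {             // blocks freed by other threads
//     mi_block_t* b = thread_free;
//     thread_free = next_of(b);               // really read via `_mi_block_next`
//     push_on(&page->free, b);                // make the block allocatable again
//     page->used--;                           // `used` still counted these blocks
//   }
//
// Afterwards `used` again equals the number of blocks that are actually alive.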
// ------------------------------------------------------
// Object sizes
// ------------------------------------------------------
#define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map.
#define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b)
#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks (such that we guarantee natural alignment)
#define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation
#if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8
#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*32) // 160 >= sizeof(mi_page_t)
#else
#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*32) // 128/96 >= sizeof(mi_page_t)
#endif
// The max object sizes are checked so that we don't waste more than 12.5% internally over the page sizes.
#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < ~8 KiB
#if MI_ENABLE_LARGE_PAGES
#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB
#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 512KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
#else
#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/4) // <= 128 KiB
#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
#endif
#define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE)
#if (MI_LARGE_MAX_OBJ_WSIZE >= 655360)
#error "mimalloc internal: define more bins"
#endif
// ------------------------------------------------------
// Page kinds
// ------------------------------------------------------
typedef enum mi_page_kind_e {
MI_PAGE_SMALL, // small blocks go into 64KiB pages
MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages
MI_PAGE_LARGE, // larger blocks go into 4MiB pages (if `MI_ENABLE_LARGE_PAGES==1`)
MI_PAGE_SINGLETON // page containing a single block.
// used for blocks `> MI_LARGE_MAX_OBJ_SIZE` or an alignment `> MI_PAGE_MAX_OVERALLOC_ALIGN`.
} mi_page_kind_t;
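// A hedged sketch (hypothetical helper) of choosing a page kind from a block size,
// following the limits defined above:
static inline mi_page_kind_t sketch_page_kind_of(size_t block_size) {
  if (block_size <= MI_SMALL_MAX_OBJ_SIZE)  return MI_PAGE_SMALL;
  if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) return MI_PAGE_MEDIUM;
  if (block_size <= MI_LARGE_MAX_OBJ_SIZE)  return MI_PAGE_LARGE;
  return MI_PAGE_SINGLETON;  // huge blocks (or very large alignments) get a page of their own
}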
// ---------------------------------------------------------------
// a memory id tracks the provenance of arena/OS allocated memory
// ---------------------------------------------------------------
// Memory can reside in an arena, be allocated directly from the OS, or be statically allocated. The memid keeps track of this.
typedef enum mi_memkind_e {
MI_MEM_NONE, // not allocated
MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example)
MI_MEM_OS, // allocated from the OS
MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
MI_MEM_OS_REMAP, // allocated in a remappable area (i.e. using `mremap`)
MI_MEM_ARENA // allocated from an arena (the usual case)
} mi_memkind_t;
static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
}
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t size; // full allocation size
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
size_t block_index; // index in the arena
mi_arena_id_t id; // arena id (>= 1)
bool is_exclusive; // this arena can only be used for specific arena allocations
} mi_memid_arena_info_t;
typedef struct mi_memid_s {
union {
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
} mem;
bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2MiB) or huge (1GiB) OS pages)
bool initially_committed;// `true` if the memory was originally allocated as committed
bool initially_zero; // `true` if the memory was originally zero initialized
mi_memkind_t memkind;
} mi_memid_t;
// ---------------------------------------------------------------
// Segments contain mimalloc pages
// ---------------------------------------------------------------
typedef struct mi_subproc_s mi_subproc_t;
// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS.
// Inside segments we allocate fixed size _pages_ that contain blocks.
typedef struct mi_segment_s {
// constant fields
mi_memid_t memid; // memory id to track provenance
bool allow_decommit;
bool allow_purge;
size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE`
mi_subproc_t* subproc; // segment belongs to sub process
// segment fields
struct mi_segment_s* next; // must be the first (non-constant) segment field -- see `segment.c:segment_init`
struct mi_segment_s* prev;
bool was_reclaimed; // true if it was reclaimed (used to limit reclaim-on-free reclamation)
bool dont_free; // can be temporarily true to ensure the segment is not freed
size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it takes too long)
size_t used; // count of pages in use (`used <= capacity`)
size_t capacity; // count of available pages (`#free + used`)
size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages.
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled
struct mi_segment_s* abandoned_os_prev;
// layout like this to optimize access in `mi_free`
_Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment
size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge
mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
} mi_segment_t;
// ------------------------------------------------------
// Heaps
//
// Provide first-class heaps to allocate from.
// A heap just owns a set of pages for allocation and
// can only allocate/reallocate from the thread that created it.
// Freeing blocks can be done from any thread though.
// Per thread, the segments are shared among its heaps.
//
// Per thread, there is always a default heap that is
// used for allocation; it is initialized to statically
// point to an empty heap to avoid initialization checks
@ -461,11 +391,10 @@ typedef struct mi_tld_s mi_tld_t;
typedef struct mi_page_queue_s {
mi_page_t* first;
mi_page_t* last;
size_t count;
size_t block_size;
} mi_page_queue_t;
#define MI_BIN_FULL (MI_BIN_HUGE+1)
// Random context
typedef struct mi_random_cxt_s {
uint32_t input[16];
@ -476,7 +405,7 @@ typedef struct mi_random_cxt_s {
// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows
#if MI_PADDING
typedef struct mi_padding_s {
uint32_t canary; // encoded block value to check validity of the padding (in case of overflow)
uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes)
@ -493,12 +422,9 @@ typedef struct mi_padding_s {
// A heap owns a set of pages.
struct mi_heap_s {
mi_tld_t* tld; // thread-local data
mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL)
uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`)
mi_random_ctx_t random; // random number context used for secure allocation
size_t page_count; // total number of pages in the `pages` queues.
size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues)
@ -506,7 +432,9 @@ struct mi_heap_s {
long generic_count; // how often is `_mi_malloc_generic` called?
long generic_collect_count; // how often is `_mi_malloc_generic` called without collecting?
mi_heap_t* next; // list of heaps per thread
long page_full_retain; // how many full pages can be retained per queue (before abandoning them)
bool allow_page_reclaim; // `true` if this heap can reclaim abandoned pages
bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint
uint8_t tag; // custom tag, can be used for separating heaps based on the object types
#if MI_GUARDED
size_t guarded_size_min; // minimal size for guarded objects
@ -516,7 +444,8 @@ struct mi_heap_s {
size_t guarded_sample_count; // current sample count (counting down to 0)
#endif
mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
mi_page_queue_t pages[MI_BIN_COUNT]; // queue of pages for each size class (or "bin")
mi_memid_t memid; // provenance of the heap struct itself (meta or os)
};
@ -526,15 +455,23 @@ struct mi_heap_s {
// static variables of a process.
// ------------------------------------------------------
struct mi_subproc_s {
_Atomic(size_t) abandoned_count; // count of abandoned segments for this sub-process
_Atomic(size_t) abandoned_os_list_count; // count of abandoned segments in the os-list
mi_lock_t abandoned_os_lock; // lock for the abandoned os segment list (outside of arena's) (this lock protect list operations)
mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list
mi_segment_t* abandoned_os_list; // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory)
mi_segment_t* abandoned_os_list_tail; // the tail-end of the list
mi_memid_t memid; // provenance of this memory block
};
#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`)
// 160 arenas is enough for ~2 TiB memory
typedef struct mi_subproc_s {
_Atomic(size_t) arena_count; // current count of arena's
_Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS]; // arena's of this sub-process
mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time
_Atomic(int64_t) purge_expire; // expiration is set if any arenas can be purged
_Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process
mi_page_t* os_abandoned_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on)
mi_lock_t os_abandoned_pages_lock; // lock for the os abandoned pages list (this lock protects list operations)
mi_memid_t memid; // provenance of this memory block (meta or OS)
mi_stats_t stats; // sub-process statistics (tld stats are merged in on thread termination)
} mi_subproc_t;
// ------------------------------------------------------
@ -544,37 +481,44 @@ struct mi_subproc_s {
// Milliseconds as in `int64_t` to avoid overflows
typedef int64_t mi_msecs_t;
// Queue of segments
typedef struct mi_segment_queue_s {
mi_segment_t* first;
mi_segment_t* last;
} mi_segment_queue_t;
// Segments thread local data
typedef struct mi_segments_tld_s {
mi_segment_queue_t small_free; // queue of segments with free small pages
mi_segment_queue_t medium_free; // queue of segments with free medium pages
mi_page_queue_t pages_purge; // queue of freed pages that are delay purged
size_t count; // current number of segments;
size_t peak_count; // peak number of segments
size_t current_size; // current size of all segments
size_t peak_size; // peak size of all segments
size_t reclaim_count;// number of reclaimed (abandoned) segments
mi_subproc_t* subproc; // sub-process this thread belongs to.
mi_stats_t* stats; // points to tld stats
} mi_segments_tld_t;
// Thread local data
struct mi_tld_s {
mi_threadid_t thread_id; // thread id of this thread
size_t thread_seq; // thread sequence id (linear count of created threads)
mi_subproc_t* subproc; // sub-process this thread belongs to.
mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted)
mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates)
unsigned long long heartbeat; // monotonic heartbeat count
bool recurse; // true if deferred was called; used to prevent infinite recursion.
bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks)
mi_stats_t stats; // statistics
mi_memid_t memid; // provenance of the tld memory itself (meta or OS)
};
/* -----------------------------------------------------------
Error codes passed to `_mi_fatal_error`
All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
For portability define undefined error codes using common Unix codes:
<https://www-numi.fnal.gov/offline_software/srt_public_context/WebDocs/Errors/unix_system_errors.html>
----------------------------------------------------------- */
#ifndef EAGAIN // double free
#define EAGAIN (11)
#endif
#ifndef ENOMEM // out of memory
#define ENOMEM (12)
#endif
#ifndef EFAULT // corrupted free-list or meta-data
#define EFAULT (14)
#endif
#ifndef EINVAL // trying to free an invalid pointer
#define EINVAL (22)
#endif
#ifndef EOVERFLOW // count*size overflow
#define EOVERFLOW (75)
#endif
// ------------------------------------------------------
// Debug
@ -622,25 +566,61 @@ void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line
#endif
#endif
// add to stat keeping track of the peak
void __mi_stat_increase(mi_stat_count_t* stat, size_t amount);
void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount);
void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount);
// adjust stat in special cases to compensate for double counting (and does not adjust peak values and can decrease the total)
void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount);
void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount);
void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount);
void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount);
// counters can just be increased
void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount);
#if (MI_STAT)
#define mi_debug_stat_increase(stat,amount) __mi_stat_increase( &(stat), amount)
#define mi_debug_stat_decrease(stat,amount) __mi_stat_decrease( &(stat), amount)
#define mi_debug_stat_counter_increase(stat,amount) __mi_stat_counter_increase( &(stat), amount)
#define mi_debug_stat_increase_mt(stat,amount) __mi_stat_increase_mt( &(stat), amount)
#define mi_debug_stat_decrease_mt(stat,amount) __mi_stat_decrease_mt( &(stat), amount)
#define mi_debug_stat_counter_increase_mt(stat,amount) __mi_stat_counter_increase_mt( &(stat), amount)
#else
#define mi_debug_stat_increase(stat,amount) ((void)0)
#define mi_debug_stat_decrease(stat,amount) ((void)0)
#define mi_debug_stat_counter_increase(stat,amount) ((void)0)
#define mi_debug_stat_increase_mt(stat,amount) ((void)0)
#define mi_debug_stat_decrease_mt(stat,amount) ((void)0)
#define mi_debug_stat_counter_increase_mt(stat,amount) ((void)0)
#endif
#define mi_subproc_stat_counter_increase(subproc,stat,amount) __mi_stat_counter_increase_mt( &(subproc)->stats.stat, amount)
#define mi_subproc_stat_increase(subproc,stat,amount) __mi_stat_increase_mt( &(subproc)->stats.stat, amount)
#define mi_subproc_stat_decrease(subproc,stat,amount) __mi_stat_decrease_mt( &(subproc)->stats.stat, amount)
#define mi_subproc_stat_adjust_increase(subproc,stat,amnt) __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amnt)
#define mi_subproc_stat_adjust_decrease(subproc,stat,amnt) __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amnt)
#define mi_tld_stat_counter_increase(tld,stat,amount) __mi_stat_counter_increase( &(tld)->stats.stat, amount)
#define mi_tld_stat_increase(tld,stat,amount) __mi_stat_increase( &(tld)->stats.stat, amount)
#define mi_tld_stat_decrease(tld,stat,amount) __mi_stat_decrease( &(tld)->stats.stat, amount)
#define mi_tld_stat_adjust_increase(tld,stat,amnt) __mi_stat_adjust_increase( &(tld)->stats.stat, amnt)
#define mi_tld_stat_adjust_decrease(tld,stat,amnt) __mi_stat_adjust_decrease( &(tld)->stats.stat, amnt)
#define mi_os_stat_counter_increase(stat,amount) mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount)
#define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount)
#define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount)
#define mi_heap_stat_counter_increase(heap,stat,amount) mi_tld_stat_counter_increase(heap->tld, stat, amount)
#define mi_heap_stat_increase(heap,stat,amount) mi_tld_stat_increase( heap->tld, stat, amount)
#define mi_heap_stat_decrease(heap,stat,amount) mi_tld_stat_decrease( heap->tld, stat, amount)
#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_stat_counter_increase( (heap)->tld->stats.stat, amount)
#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_stat_increase( (heap)->tld->stats.stat, amount)
#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_stat_decrease( (heap)->tld->stats.stat, amount)
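// For example, a call site in `alloc.c` below uses
//   mi_heap_stat_increase(heap, malloc_normal, bsize);
// which expands (via `mi_tld_stat_increase`) to
//   __mi_stat_increase(&heap->tld->stats.malloc_normal, bsize);
// while the `mi_debug_stat_*` variants compile away to `((void)0)` unless MI_STAT is enabled.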
#endif // MI_TYPES_H

View file

@ -16,21 +16,22 @@ terms of the MIT license. A copy of the license can be found in the file
// ------------------------------------------------------
static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) {
// objects up to `MI_PAGE_MIN_BLOCK_ALIGN` are always allocated aligned to their size
mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0));
if (alignment > size) return false;
if (alignment <= MI_MAX_ALIGN_SIZE) return true;
const size_t bsize = mi_good_size(size);
const bool ok = (bsize <= MI_PAGE_MAX_START_BLOCK_ALIGN2 && _mi_is_power_of_two(bsize));
if (ok) { mi_assert_internal((bsize & (alignment-1)) == 0); } // since both power of 2 and alignment <= size
return ok;
}
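// Usage sketch (hypothetical driver using the public API): when the natural alignment
// already suffices, a plain allocation can be returned without over-allocation or
// pointer adjustment; otherwise fall back to the general aligned path:
//
//   void* sketch_malloc_aligned(size_t size, size_t alignment) {
//     if (mi_malloc_is_naturally_aligned(size, alignment)) {
//       return mi_malloc(size);                   // the size class guarantees the alignment
//     }
//     return mi_malloc_aligned(size, alignment);  // general (possibly over-allocating) path
//   }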
#if MI_GUARDED
static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, size_t size, size_t alignment, bool zero) mi_attr_noexcept {
// use over-allocation for guarded blocks
mi_assert_internal(alignment > 0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN);
const size_t oversize = size + alignment - 1;
void* base = _mi_heap_malloc_guarded(heap, oversize, zero);
void* p = _mi_align_up_ptr(base, alignment);
mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size);
mi_assert_internal(mi_usable_size(p) >= size);
mi_assert_internal(_mi_is_aligned(p, alignment));
@ -59,21 +60,20 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
void* p;
size_t oversize;
if mi_unlikely(alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
// use OS allocation for large alignments and allocate inside a singleton page (not in an arena)
// This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned
// in the first (and single) page such that the page info is `MI_PAGE_ALIGN` bytes before it (and can be found in the _mi_page_map).
if mi_unlikely(offset != 0) {
// todo: cannot support offset alignment for very large alignments yet
#if MI_DEBUG > 0
_mi_error_message(EOVERFLOW, "aligned allocation with a large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
#endif
return NULL;
}
oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
// note: no guarded as alignment > 0
// zero afterwards as only the area from the aligned_p may be committed!
p = _mi_heap_malloc_zero_ex(heap, oversize, zero, alignment); // the page block size should be large enough to align in the single huge page block
if (p == NULL) return NULL;
}
else {
@ -114,13 +114,13 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
#endif
// now zero the block if needed
//if (alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
// // for the tracker, on huge aligned allocations only from the start of the large block is defined
// mi_track_mem_undefined(aligned_p, size);
// if (zero) {
// _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
// }
//}
if (p != aligned_p) {
mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p));
@ -177,12 +177,14 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
}
#if MI_GUARDED
if (offset==0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN && mi_heap_malloc_use_guarded(heap,size)) {
return mi_heap_malloc_guarded_aligned(heap, size, alignment, zero);
}
#endif
// try first if there happens to be a small block available with just the right alignment
// since most small power-of-2 blocks (under MI_PAGE_MAX_BLOCK_START_ALIGN2) are already
// naturally aligned this can be often the case.
if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) {
const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)`
const size_t padsize = size + MI_PADDING_SIZE;

View file

@ -30,7 +30,11 @@ terms of the MIT license. A copy of the license can be found in the file
// Note: in release mode the (inlined) routine is about 7 instructions with a single test.
extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept
{
if (page->block_size != 0) { // not the empty heap
mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_ptr_page(page)==page);
}
// check the free list
mi_block_t* const block = page->free;
@ -82,7 +86,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_
#if (MI_STAT>0)
const size_t bsize = mi_page_usable_block_size(page);
if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_increase(heap, malloc_normal, bsize);
mi_heap_stat_counter_increase(heap, malloc_normal_count, 1);
#if (MI_STAT>1)
@ -130,7 +134,7 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap,
mi_assert(size <= MI_SMALL_SIZE_MAX);
#if MI_DEBUG
const uintptr_t tid = _mi_thread_id();
mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == tid); // heaps are thread local
#endif
#if (MI_PADDING || MI_GUARDED)
if (size == 0) { size = sizeof(void*); }
@ -184,7 +188,7 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z
else {
// regular allocation
mi_assert(heap!=NULL);
mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == _mi_thread_id()); // heaps are thread local
void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic
mi_track_malloc(p,size,zero);
@ -268,7 +272,7 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero)
// if p == NULL then behave as malloc.
// else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)).
// (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.)
const size_t size = (p==NULL ? 0 : _mi_usable_size(p,"mi_realloc"));
if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) { // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0)
mi_assert_internal(p!=NULL);
// todo: do not track as the usable size is still the same in the free; adjust potential padding?
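// For example, under the rule above (a sketch; exact reuse also depends on the
// usable size of the block's size class):
//   void* p = mi_malloc(1000);
//   p = mi_realloc(p, 600);  // 600 >= usable/2: the block is reused in place
//   p = mi_realloc(p, 100);  // shrunk below half: may move to a smaller block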
@ -615,7 +619,6 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
block->next = MI_BLOCK_TAG_GUARDED;
// set guard page at the end of the block
const size_t block_size = mi_page_block_size(page); // must use `block_size` to match `mi_free_local`
const size_t os_page_size = _mi_os_page_size();
mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t));
@ -625,8 +628,11 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
return NULL;
}
uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size;
// note: the alignment of the guard page relies on blocks being os_page_size aligned which
// is ensured in `mi_arena_page_alloc_fresh`.
mi_assert_internal(_mi_is_aligned(block, os_page_size));
mi_assert_internal(_mi_is_aligned(guard_page, os_page_size));
if (!page->memid.is_pinned && _mi_is_aligned(guard_page, os_page_size)) {
_mi_os_protect(guard_page, os_page_size);
}
else {
@ -636,11 +642,11 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
// align pointer just in front of the guard page
size_t offset = block_size - os_page_size - obj_size;
mi_assert_internal(offset > sizeof(mi_block_t));
if (offset > MI_PAGE_MAX_OVERALLOC_ALIGN) {
// give up to place it right in front of the guard page if the offset is too large for unalignment
offset = MI_PAGE_MAX_OVERALLOC_ALIGN;
}
void* p = (uint8_t*)block + offset;
mi_track_align(block, p, offset, obj_size);
mi_track_mem_defined(block, sizeof(mi_block_t));
return p;
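// Usage sketch for guarded builds (assuming a build with MI_GUARDED=1 and the guarded
// options, e.g. set through the environment):
//
//   MIMALLOC_GUARDED_SAMPLE_RATE=1000 MIMALLOC_GUARDED_MIN=16 ./myprogram
//
// samples roughly 1 in 1000 qualifying allocations and places a protected guard page
// right behind each sampled object, so an out-of-bounds write faults immediately
// instead of silently corrupting the heap.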
@ -659,16 +665,16 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo
const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size);
mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(heap, req_size, zero, 0 /* huge_alignment */);
if (block==NULL) return NULL;
void* const p = mi_block_ptr_set_guarded(block, obj_size);
// stats
mi_track_malloc(p, size, zero);
if (p != NULL) {
if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
#if MI_STAT>1
mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p));
#endif
mi_heap_stat_counter_increase(heap, malloc_guarded_count, 1);
}
#if MI_DEBUG>3
if (p != NULL && zero) {

View file

@ -1,346 +0,0 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#if !defined(MI_IN_ARENA_C)
#error "this file should be included from 'arena.c' (so mi_arena_t is visible)"
// add includes to help an IDE
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"
#endif
// Minimal exports for arena-abandoned.
size_t mi_arena_id_index(mi_arena_id_t id);
mi_arena_t* mi_arena_from_index(size_t idx);
size_t mi_arena_get_count(void);
void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex);
bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index);
/* -----------------------------------------------------------
Abandoned blocks/segments:
_mi_arena_segment_clear_abandoned
_mi_arena_segment_mark_abandoned
This is used to atomically abandon/reclaim segments
(and crosses the arena API but it is convenient to have here).
Abandoned segments still have live blocks; they get reclaimed
when a thread frees a block in it, or when a thread needs a fresh
segment.
Abandoned segments are atomically marked in the `blocks_abandoned`
bitmap of arenas. Any segments allocated outside arenas are put
in the sub-process `abandoned_os_list`. This list is accessed
using locks but this should be uncommon and generally uncontended.
Reclaiming and visiting either scan through the `blocks_abandoned`
bitmaps of the arena's, or visit the `abandoned_os_list`.
A potentially nicer design is to use arena's for everything,
and perhaps have virtual arena's to map OS allocated memory,
but this would lack the "density" of our current arena's. TBC.
----------------------------------------------------------- */
// reclaim a specific OS abandoned segment; `true` on success.
// sets the thread_id.
static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) {
mi_assert(segment->memid.memkind != MI_MEM_ARENA);
// not in an arena, remove from list of abandoned os segments
mi_subproc_t* const subproc = segment->subproc;
if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) {
return false; // failed to acquire the lock, we just give up
}
// remove atomically from the abandoned os list (if possible!)
bool reclaimed = false;
mi_segment_t* const next = segment->abandoned_os_next;
mi_segment_t* const prev = segment->abandoned_os_prev;
if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) {
#if MI_DEBUG>3
// find ourselves in the abandoned list (and check the count)
bool found = false;
size_t count = 0;
for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) {
if (current == segment) { found = true; }
count++;
}
mi_assert_internal(found);
mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count));
#endif
// remove (atomically) from the list and reclaim
if (prev != NULL) { prev->abandoned_os_next = next; }
else { subproc->abandoned_os_list = next; }
if (next != NULL) { next->abandoned_os_prev = prev; }
else { subproc->abandoned_os_list_tail = prev; }
segment->abandoned_os_next = NULL;
segment->abandoned_os_prev = NULL;
mi_atomic_decrement_relaxed(&subproc->abandoned_count);
mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count);
if (take_lock) { // don't reset the thread_id when iterating
mi_atomic_store_release(&segment->thread_id, _mi_thread_id());
}
reclaimed = true;
}
if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); }
return reclaimed;
}
// reclaim a specific abandoned segment; `true` on success.
// sets the thread_id.
bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) {
if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) {
return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */);
}
// arena segment: use the blocks_abandoned bitmap.
size_t arena_idx;
size_t bitmap_idx;
mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx);
mi_arena_t* arena = mi_arena_from_index(arena_idx);
mi_assert_internal(arena != NULL);
// reclaim atomically
bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx);
if (was_marked) {
mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0);
mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count);
mi_atomic_store_release(&segment->thread_id, _mi_thread_id());
}
// mi_assert_internal(was_marked);
mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
//mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx));
return was_marked;
}
// mark a specific OS segment as abandoned
static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) {
mi_assert(segment->memid.memkind != MI_MEM_ARENA);
// not in an arena; we use a list of abandoned segments
mi_subproc_t* const subproc = segment->subproc;
mi_lock(&subproc->abandoned_os_lock) {
// push on the tail of the list (important for the visitor)
mi_segment_t* prev = subproc->abandoned_os_list_tail;
mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL);
mi_assert_internal(segment->abandoned_os_prev == NULL);
mi_assert_internal(segment->abandoned_os_next == NULL);
if (prev != NULL) { prev->abandoned_os_next = segment; }
else { subproc->abandoned_os_list = segment; }
subproc->abandoned_os_list_tail = segment;
segment->abandoned_os_prev = prev;
segment->abandoned_os_next = NULL;
mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count);
mi_atomic_increment_relaxed(&subproc->abandoned_count);
// and release the lock
}
return;
}
// mark a specific segment as abandoned
// clears the thread_id.
void _mi_arena_segment_mark_abandoned(mi_segment_t* segment)
{
mi_assert_internal(segment->used == segment->abandoned);
mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's
if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) {
mi_arena_segment_os_mark_abandoned(segment);
return;
}
// segment is in an arena, mark it in the arena `blocks_abandoned` bitmap
size_t arena_idx;
size_t bitmap_idx;
mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx);
mi_arena_t* arena = mi_arena_from_index(arena_idx);
mi_assert_internal(arena != NULL);
// set abandonment atomically
mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned
const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL);
if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); }
mi_assert_internal(was_unmarked);
mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
}
/* -----------------------------------------------------------
Iterate through the abandoned blocks/segments using a cursor.
This is used for reclaiming and abandoned block visiting.
----------------------------------------------------------- */
// start a cursor at a randomized arena
void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) {
mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc);
current->bitmap_idx = 0;
current->subproc = subproc;
current->visit_all = visit_all;
current->hold_visit_lock = false;
const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count);
const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count);
const size_t max_arena = mi_arena_get_count();
if (heap != NULL && heap->arena_id != _mi_arena_id_none()) {
// for a heap that is bound to one arena, only visit that arena
current->start = mi_arena_id_index(heap->arena_id);
current->end = current->start + 1;
current->os_list_count = 0;
}
else {
// otherwise visit all starting at a random location
if (abandoned_count > abandoned_list_count && max_arena > 0) {
current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena));
current->end = current->start + max_arena;
}
else {
current->start = 0;
current->end = 0;
}
current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list
}
mi_assert_internal(current->start <= max_arena);
}
void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) {
if (current->hold_visit_lock) {
mi_lock_release(&current->subproc->abandoned_os_visit_lock);
current->hold_visit_lock = false;
}
}
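// Usage sketch of the cursor API above (hedged: assumes the combined
// `_mi_arena_segment_clear_abandoned_next` entry point that builds on the
// field/list helpers below):
//
//   mi_arena_field_cursor_t cursor;
//   _mi_arena_field_cursor_init(heap, subproc, false /* visit_all */, &cursor);
//   mi_segment_t* segment;
//   while ((segment = _mi_arena_segment_clear_abandoned_next(&cursor)) != NULL) {
//     // try to reclaim `segment` into this heap, or re-abandon it
//   }
//   _mi_arena_field_cursor_done(&cursor);  // releases the visit lock if held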
static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) {
// try to reclaim an abandoned segment in the arena atomically
if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL;
mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx);
mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0);
// check that the segment belongs to our sub-process
// note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled.
// without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process.
// for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock.
if (segment->subproc != subproc) {
// it is from another sub-process, re-mark it and continue searching
const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL);
mi_assert_internal(was_zero); MI_UNUSED(was_zero);
return NULL;
}
else {
// success, we unabandoned a segment in our sub-process
mi_atomic_decrement_relaxed(&subproc->abandoned_count);
return segment;
}
}
static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) {
const size_t max_arena = mi_arena_get_count();
size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx);
size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx);
  // visit arenas (from the previous cursor)
for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) {
// index wraps around
size_t arena_idx = (previous->start >= max_arena ? previous->start % max_arena : previous->start);
mi_arena_t* arena = mi_arena_from_index(arena_idx);
if (arena != NULL) {
bool has_lock = false;
// visit the abandoned fields (starting at previous_idx)
for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) {
size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]);
if mi_unlikely(field != 0) { // skip zero fields quickly
// we only take the arena lock if there are actually abandoned segments present
if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) {
has_lock = (previous->visit_all ? (mi_lock_acquire(&arena->abandoned_visit_lock),true) : mi_lock_try_acquire(&arena->abandoned_visit_lock));
if (!has_lock) {
if (previous->visit_all) {
_mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock");
}
// skip to next arena
break;
}
}
mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned));
// visit each set bit in the field (todo: maybe use `ctz` here?)
for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) {
// pre-check if the bit is set
size_t mask = ((size_t)1 << bit_idx);
if mi_unlikely((field & mask) == mask) {
mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx);
mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx);
if (segment != NULL) {
//mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx));
if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); }
previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration
return segment;
}
}
}
}
}
if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); }
}
}
return NULL;
}
static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) {
// go through the abandoned_os_list
  // we only allow one thread per sub-process to visit it, guarded by the `abandoned_os_visit_lock`.
// The lock is released when the cursor is released.
if (!previous->hold_visit_lock) {
previous->hold_visit_lock = (previous->visit_all ? (mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock),true)
: mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock));
if (!previous->hold_visit_lock) {
if (previous->visit_all) {
_mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock");
}
return NULL; // we cannot get the lock, give up
}
}
// One list entry at a time
while (previous->os_list_count > 0) {
previous->os_list_count--;
mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free`
mi_segment_t* segment = previous->subproc->abandoned_os_list;
// pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries)
if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) {
mi_lock_release(&previous->subproc->abandoned_os_lock);
return segment;
}
// already abandoned, try again
mi_lock_release(&previous->subproc->abandoned_os_lock);
}
// done
mi_assert_internal(previous->os_list_count == 0);
return NULL;
}
// reclaim abandoned segments
// this does not set the thread id (so it appears as still abandoned)
mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) {
if (previous->start < previous->end) {
// walk the arena
mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous);
if (segment != NULL) { return segment; }
}
  // no entries in the arenas anymore, walk the abandoned OS list
mi_assert_internal(previous->start == previous->end);
return mi_arena_segment_clear_abandoned_next_list(previous);
}
bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
// (unfortunately) the visit_abandoned option must be enabled from the start.
// This is to avoid taking locks if abandoned list visiting is not required (as for most programs)
if (!mi_option_is_enabled(mi_option_visit_abandoned)) {
_mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON");
return false;
}
mi_arena_field_cursor_t current;
_mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, &current);
mi_segment_t* segment;
bool ok = true;
while (ok && (segment = _mi_arena_segment_clear_abandoned_next(&current)) != NULL) {
ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg);
_mi_arena_segment_mark_abandoned(segment);
}
_mi_arena_field_cursor_done(&current);
return ok;
}
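// A minimal illustrative sketch (not part of this commit) of the public API above,
// assuming the program runs with MIMALLOC_VISIT_ABANDONED=ON; `count_blocks` is a
// hypothetical visitor (`mi_subproc_main` and `mi_block_visit_fun` are part of the
// public mimalloc API):
//
//   static bool count_blocks(const mi_heap_t* heap, const mi_heap_area_t* area,
//                            void* block, size_t block_size, void* arg) {
//     (void)heap; (void)area; (void)block_size;
//     if (block != NULL) { (*(size_t*)arg)++; }  // `block` is NULL for the per-area visit
//     return true;  // keep visiting
//   }
//
//   size_t count = 0;
//   mi_abandoned_visit_blocks(mi_subproc_main(), -1 /* any heap tag */,
//                             true /* visit individual blocks */, &count_blocks, &count);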

src/arena-meta.c Normal file (+174 lines)

View file

@@ -0,0 +1,174 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
We have a special "mini" allocator just for allocation of meta-data like
the heap (`mi_heap_t`) or thread-local data (`mi_tld_t`).
We reuse the arena bitmap code for the allocation of 128-byte blocks inside
an arena slice (64 KiB).
We always ensure that meta-data is zeroed (we zero on `free`).
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"
/* -----------------------------------------------------------
Meta data allocation
----------------------------------------------------------- */
#define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE
#define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN
#define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE >= 4k (even on 32-bit)
#define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE
#define MI_META_BLOCKS_PER_PAGE (MI_META_PAGE_SIZE / MI_META_BLOCK_SIZE) // 512
#define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE)
typedef struct mi_meta_page_s {
_Atomic(struct mi_meta_page_s*) next; // a linked list of meta-data pages (never released)
mi_memid_t memid; // provenance of the meta-page memory itself
mi_bbitmap_t blocks_free; // a small bitmap with 1 bit per block.
} mi_meta_page_t;
static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_VAR_INIT(NULL);
#if MI_DEBUG > 1
static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) {
mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + _mi_os_secure_guard_page_size());
if (block_idx != NULL) {
*block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE;
}
return mpage;
}
#endif
static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) {
return mi_atomic_load_ptr_acquire(mi_meta_page_t, &mpage->next);
}
static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) {
mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - _mi_os_secure_guard_page_size(), MI_META_PAGE_ALIGN));
mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE);
void* p = ((uint8_t*)mpage - _mi_os_secure_guard_page_size() + (block_idx * MI_META_BLOCK_SIZE));
mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL));
return p;
}
// allocate a fresh meta page and add it to the global list.
static mi_meta_page_t* mi_meta_page_zalloc(void) {
// allocate a fresh arena slice
// note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again..
mi_memid_t memid;
uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0,
true /* commit*/, (MI_SECURE==0) /* allow large? */,
NULL /* req arena */, 0 /* thread_seq */, &memid);
if (base == NULL) return NULL;
mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN));
if (!memid.initially_zero) {
_mi_memzero_aligned(base, MI_ARENA_SLICE_SIZE);
}
// guard pages
#if MI_SECURE >= 1
_mi_os_secure_guard_page_set_at(base, memid.is_pinned);
_mi_os_secure_guard_page_set_before(base + MI_META_PAGE_SIZE, memid.is_pinned);
#endif
// initialize the page and free block bitmap
mi_meta_page_t* mpage = (mi_meta_page_t*)(base + _mi_os_secure_guard_page_size());
mpage->memid = memid;
mi_bbitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */);
const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bbitmap_size(MI_META_BLOCKS_PER_PAGE, NULL);
const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE);
const size_t guard_blocks = _mi_divide_up(_mi_os_secure_guard_page_size(), MI_META_BLOCK_SIZE);
mi_assert_internal(info_blocks + 2*guard_blocks < MI_META_BLOCKS_PER_PAGE);
mi_bbitmap_unsafe_setN(&mpage->blocks_free, info_blocks + guard_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks - 2*guard_blocks);
// push atomically in front of the meta page list
// (note: there is no ABA issue since we never free meta-pages)
mi_meta_page_t* old = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages);
do {
mi_atomic_store_ptr_release(mi_meta_page_t, &mpage->next, old);
} while(!mi_atomic_cas_ptr_weak_acq_rel(mi_meta_page_t,&mi_meta_pages,&old,mpage));
return mpage;
}
// allocate meta-data
mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid )
{
mi_assert_internal(pmemid != NULL);
size = _mi_align_up(size,MI_META_BLOCK_SIZE);
if (size == 0 || size > MI_META_MAX_SIZE) return NULL;
const size_t block_count = _mi_divide_up(size,MI_META_BLOCK_SIZE);
mi_assert_internal(block_count > 0 && block_count < MI_BCHUNK_BITS);
mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages);
mi_meta_page_t* mpage = mpage0;
while (mpage != NULL) {
size_t block_idx;
if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) {
// found and claimed `block_count` blocks
*pmemid = _mi_memid_create_meta(mpage, block_idx, block_count);
return mi_meta_block_start(mpage,block_idx);
}
else {
mpage = mi_meta_page_next(mpage);
}
}
// failed to find space in existing pages
if (mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages) != mpage0) {
// the page list was updated by another thread in the meantime, retry
return _mi_meta_zalloc(size,pmemid);
}
// otherwise, allocate a fresh metapage and try once more
mpage = mi_meta_page_zalloc();
if (mpage != NULL) {
size_t block_idx;
if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) {
// found and claimed `block_count` blocks
*pmemid = _mi_memid_create_meta(mpage, block_idx, block_count);
return mi_meta_block_start(mpage,block_idx);
}
}
// if all this failed, allocate from the OS
return _mi_os_alloc(size, pmemid);
}
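// A minimal illustrative sketch (not part of this commit) of internal usage:
// meta-data comes back zero'd, rounded up to MI_META_BLOCK_SIZE (128b) blocks,
// and must be freed with the same size and the returned `memid`.
//
//   mi_memid_t memid;
//   void* md = _mi_meta_zalloc(200, &memid);   // claims 2 blocks (256 bytes)
//   if (md != NULL) {
//     // ... use the zero-initialized meta-data ...
//     _mi_meta_free(md, 200, memid);           // re-zeroes and marks the blocks free
//   }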
// free meta-data
mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) {
if (p==NULL) return;
if (memid.memkind == MI_MEM_META) {
mi_assert_internal(_mi_divide_up(size, MI_META_BLOCK_SIZE) == memid.mem.meta.block_count);
const size_t block_count = memid.mem.meta.block_count;
const size_t block_idx = memid.mem.meta.block_index;
mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page;
mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage);
mi_assert_internal(block_idx + block_count <= MI_META_BLOCKS_PER_PAGE);
mi_assert_internal(mi_bbitmap_is_clearN(&mpage->blocks_free, block_idx, block_count));
// we zero on free (and on the initial page allocation) so we don't need a "dirty" map
_mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE);
mi_bbitmap_setN(&mpage->blocks_free, block_idx, block_count);
}
else {
_mi_arenas_free(p,size,memid);
}
}
// used for debug output
bool _mi_meta_is_meta_page(void* p)
{
mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t, &mi_meta_pages);
mi_meta_page_t* mpage = mpage0;
while (mpage != NULL) {
if ((void*)mpage == p) return true;
mpage = mi_meta_page_next(mpage);
}
return false;
}

File diff suppressed because it is too large

File diff suppressed because it is too large

src/bitmap.h

View file

@@ -1,110 +1,313 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2023 Microsoft Research, Daan Leijen
Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
Concurrent bitmap that can set/reset sequences of bits atomically,
represented as an array of fields where each field is a machine word (`size_t`)
There are two APIs; the standard one cannot have sequences that cross
between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
(this is used in region allocation)
The `_across` postfixed functions do allow sequences that can cross over
between the fields. (This is used in arena allocation)
Concurrent bitmap that can set/reset sequences of bits atomically
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITMAP_H
#define MI_BITMAP_H
/* -----------------------------------------------------------
Bitmap definition
----------------------------------------------------------- */
/* --------------------------------------------------------------------------------
Atomic bitmaps with release/acquire guarantees:
#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE)
#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set
`mi_bfield_t`: is a single machine word that can efficiently be bit counted (usually `size_t`)
each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB).
We need 16K bits to represent a 1GiB arena.
// An atomic bitmap of `size_t` fields
typedef _Atomic(size_t) mi_bitmap_field_t;
typedef mi_bitmap_field_t* mi_bitmap_t;
`mi_bchunk_t`: a chunk of bfield's of a total of MI_BCHUNK_BITS (= 512 on 64-bit, 256 on 32-bit)
allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number
of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB).
These chunks are cache-aligned and we can use AVX2/AVX512/NEON/SVE/SVE2/etc. instructions
to scan for bits (perhaps) more efficiently.
// A bitmap index is the index of the bit in a bitmap.
typedef size_t mi_bitmap_index_t;
We allocate byte-sized ranges aligned to bytes in the bfield, and bfield-sized
ranges aligned to a bfield.
// Create a bit index.
static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) {
mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS);
return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
}
static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
return mi_bitmap_index_create_ex(idx,bitidx);
}
Searching linearly through the chunks would be too slow (16K bits per GiB).
Instead we add a "chunkmap" to do a two-level search (more or less a btree of depth 2).
// Get the field index from a bit index.
static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx / MI_BITMAP_FIELD_BITS);
}
`mi_bchunkmap_t` (== `mi_bchunk_t`): for each chunk we track if it has (potentially) any bit set.
The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set.
This is used to avoid scanning every chunk. (and thus strictly an optimization)
It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out
to have no bits set. It is also allowed to briefly have a clear bit even if the
chunk has bits set -- as long as we guarantee that the bit will be set later on;
(this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk).
// Get the bit index in a bitmap field
static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx % MI_BITMAP_FIELD_BITS);
}
However, when we clear a bit in a chunk, and the chunk is indeed all clear, we
cannot safely clear the bit corresponding to the chunk in the chunkmap since it
may race with another thread setting a bit in the same chunk. Therefore, when
clearing, we first test if a chunk is clear, then clear the chunkmap bit, and
then test again to catch any set bits that we may have missed.
// Get the full bit index
static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
return bitmap_idx;
}
Since the chunkmap may thus be briefly out-of-sync, this means that we may sometimes
not find a free page even though it's there (but we accept this as we avoid taking
full locks). (Another way to do this is to use an epoch but we like to avoid that complexity
for now).
/* -----------------------------------------------------------
Claim a bit sequence atomically
----------------------------------------------------------- */
`mi_bitmap_t`: a bitmap with N chunks. A bitmap has a chunkmap of MI_BCHUNK_BITS (512)
and thus has at most 512 chunks (=2^18 bits x 64 KiB slices = 16 GiB max arena size).
The minimum is 1 chunk which is a 32 MiB arena.
// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count
and pop-count (but we think it can be adapted to work reasonably well on older hardware too)
--------------------------------------------------------------------------------------------- */
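// A standalone illustrative sketch (not part of this commit) of the chunkmap
// clear protocol described above, written with plain C11 atomics instead of
// mimalloc's primitives: clear the chunkmap bit only when the chunk looks
// all-clear, then re-check and conservatively re-set it to catch a racing set.
//
//   #include <stdatomic.h>
//   #include <stdbool.h>
//   #include <stddef.h>
//
//   #define FIELD_BITS    (8*sizeof(size_t))
//   #define CHUNK_FIELDS  8
//   typedef struct { _Atomic(size_t) bfields[CHUNK_FIELDS]; } chunk_t;
//
//   static bool chunk_all_clear(chunk_t* c) {
//     for (size_t i = 0; i < CHUNK_FIELDS; i++) {
//       if (atomic_load(&c->bfields[i]) != 0) return false;
//     }
//     return true;
//   }
//
//   static void chunk_clear_bit(chunk_t* chunk, size_t bit,
//                               _Atomic(size_t)* chunkmap_field, size_t chunk_bit) {
//     atomic_fetch_and(&chunk->bfields[bit/FIELD_BITS], ~((size_t)1 << (bit%FIELD_BITS)));
//     if (chunk_all_clear(chunk)) {
//       atomic_fetch_and(chunkmap_field, ~((size_t)1 << chunk_bit));  // may race with a set..
//       if (!chunk_all_clear(chunk)) {                                // ..so test again
//         atomic_fetch_or(chunkmap_field, ((size_t)1 << chunk_bit));  // and re-set conservatively
//       }
//     }
//   }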
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
// A word-size bit field.
typedef size_t mi_bfield_t;
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3)
#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT)
#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8)
#define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 ..
#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 ..
// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
// Returns `true` if successful when all previous `count` bits were 0.
bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
#define MI_BCHUNK_SIZE (MI_BCHUNK_BITS / 8)
#define MI_BCHUNK_FIELDS (MI_BCHUNK_BITS / MI_BFIELD_BITS) // 8 on both 64- and 32-bit
//--------------------------------------------------------------------------
// the `_across` functions work on bitmaps where sequences can cross over
// between the fields. This is used in arena allocation
//--------------------------------------------------------------------------
// A bitmap chunk contains 512 bits on 64-bit (256 on 32-bit)
typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bchunk_s {
_Atomic(mi_bfield_t) bfields[MI_BCHUNK_FIELDS];
} mi_bchunk_t;
// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
// The chunkmap has one bit per corresponding chunk that is set if the chunk potentially has bits set.
// The chunkmap is itself a chunk.
typedef mi_bchunk_t mi_bchunkmap_t;
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
#define MI_BCHUNKMAP_BITS MI_BCHUNK_BITS
#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BCHUNKMAP_BITS)
#define MI_BITMAP_MIN_CHUNK_COUNT (1)
#if MI_SIZE_BITS > 32
#define MI_BITMAP_DEFAULT_CHUNK_COUNT (64) // 2 GiB on 64-bit -- this is for the page map
#else
#define MI_BITMAP_DEFAULT_CHUNK_COUNT (1)
#endif
#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena
#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena
#define MI_BITMAP_DEFAULT_BIT_COUNT (MI_BITMAP_DEFAULT_CHUNK_COUNT * MI_BCHUNK_BITS) // 2 GiB arena
// An atomic bitmap
typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s {
_Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc
mi_bchunkmap_t chunkmap;
mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
} mi_bitmap_t;
static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) {
return mi_atomic_load_relaxed(&((mi_bitmap_t*)bitmap)->chunk_count);
}
static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) {
return (mi_bitmap_chunk_count(bitmap) * MI_BCHUNK_BITS);
}
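// For example, on 64-bit (MI_BCHUNK_BITS == 512): a 1 GiB arena has 16384 slices
// of 64 KiB and thus needs 16384 bits == 32 chunks; the default 64 chunks cover a
// 2 GiB arena, and the maximum of 512 chunks covers 512*512*64KiB == 16 GiB.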
/* --------------------------------------------------------------------------------
Atomic bitmap operations
-------------------------------------------------------------------------------- */
// Many operations are generic over setting or clearing the bit sequence: we use `mi_xset_t` for this (true if setting, false if clearing)
typedef bool mi_xset_t;
#define MI_BIT_SET (true)
#define MI_BIT_CLEAR (false)
// Required size of a bitmap to represent `bit_count` bits.
size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count);
// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true
// returns the size of the bitmap.
size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero);
// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks).
// Not atomic so only use if still local to a thread.
void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n);
// Set a bit in the bitmap; returns `true` if it atomically transitioned from 0 to 1
bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx);
// Clear a bit in the bitmap; returns `true` if it atomically transitioned from 1 to 0
bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx);
// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
// If `already_set` is not NULL, it is set to the count of bits that were already set.
// (this is used for correct statistics when committing over a partially committed area)
bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set);
// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n);
// Is a sequence of n bits already all set/cleared?
bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
// Is a sequence of n bits already set?
// (Used to check if a memory range is already committed)
static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n);
}
// Is a sequence of n bits already clear?
static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n);
}
static inline bool mi_bitmap_is_set(mi_bitmap_t* bitmap, size_t idx) {
return mi_bitmap_is_setN(bitmap, idx, 1);
}
static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) {
return mi_bitmap_is_clearN(bitmap, idx, 1);
}
// Called once a bit is cleared to see if the memory slice can be claimed.
typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_heaptag_t heap_tag, bool* keep_set);
// Find a set bit in the bitmap, atomically clear it, and check if `claim` returns true.
// If not claimed, continue on (potentially setting the bit again depending on `keep_set`).
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx,
mi_claim_fun_t* claim, mi_arena_t* arena, mi_heaptag_t heap_tag );
// Atomically clear a bit but only if it is set. Will block otherwise until the bit is set.
// This is used to delay freeing a page that is at the same time being considered to be
// allocated from `mi_arena_try_abandoned` (and is in the `claim` function of `mi_bitmap_try_find_and_claim`).
void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx);
// If a bit is set in the bitmap, return `true` and set `idx` to the index of the highest bit.
// Otherwise return `false` (and `*idx` is undefined).
// Used for unloading arenas
bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx);
typedef bool (mi_forall_set_fun_t)(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg2);
// Visit all set bits in a bitmap (`slice_count == 1`)
bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg);
// Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`)
bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg);
/* ----------------------------------------------------------------------------
Binned concurrent bitmap
Assigns a size class to each chunk such that small blocks don't cause too
much fragmentation since we keep chunks for larger blocks separate.
---------------------------------------------------------------------------- */
// Size bins; larger bins are allowed to go into smaller bins.
// SMALL can only go into SMALL (or NONE) chunks, so small blocks cannot fragment the larger bins.
typedef enum mi_bbin_e {
MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free)
MI_BBIN_SMALL, // slice_count == 1
MI_BBIN_OTHER, // slice_count: any size not covered by the other bins, with 1 <= slice_count <= MI_BCHUNK_BITS
MI_BBIN_MEDIUM, // slice_count == 8
MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS -- only used if MI_ENABLE_LARGE_PAGES is 1
MI_BBIN_COUNT
} mi_bbin_t;
static inline mi_bbin_t mi_bbin_inc(mi_bbin_t bbin) {
return (mi_bbin_t)((int)bbin + 1);
}
static inline mi_bbin_t mi_bbin_of(size_t slice_count) {
if (slice_count==1) return MI_BBIN_SMALL;
if (slice_count==8) return MI_BBIN_MEDIUM;
#if MI_ENABLE_LARGE_PAGES
if (slice_count==MI_BFIELD_BITS) return MI_BBIN_LARGE;
#endif
return MI_BBIN_OTHER;
}
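// For example: mi_bbin_of(1) == MI_BBIN_SMALL (a single 64 KiB slice),
// mi_bbin_of(8) == MI_BBIN_MEDIUM (8 slices == 512 KiB), and an in-between
// size like mi_bbin_of(3) falls into MI_BBIN_OTHER.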
// An atomic "binned" bitmap for the free slices where we keep chunks reserved for particular size classes
typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s {
_Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
_Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set
size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc
mi_bchunkmap_t chunkmap;
_Atomic(uint8_t) chunk_bins[MI_BITMAP_MAX_CHUNK_COUNT]; // 512b
mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
} mi_bbitmap_t;
static inline size_t mi_bbitmap_chunk_count(const mi_bbitmap_t* bbitmap) {
return mi_atomic_load_relaxed(&((mi_bbitmap_t*)bbitmap)->chunk_count);
}
static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) {
return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS);
}
size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count);
// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true
// returns the size of the bitmap.
size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero);
// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks).
// Not atomic so only use if still local to a thread.
void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Set a sequence of `n` bits in the bbitmap; returns `true` if atomically transitioned from all 0's to 1's
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Is a sequence of n bits already all set/cleared?
bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Is a sequence of n bits already set?
// (Used to check if a memory range is already committed)
static inline bool mi_bbitmap_is_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
return mi_bbitmap_is_xsetN(MI_BIT_SET, bbitmap, idx, n);
}
// Is a sequence of n bits already clear?
static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
return mi_bbitmap_is_xsetN(MI_BIT_CLEAR, bbitmap, idx, n);
}
// Try to atomically transition `n` bits from all set to all clear. Returns `true` on success.
// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
// Specialized versions for common bit sequence sizes
bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 1-bit
bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits
// bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS
bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx); // n <= MI_BFIELD_BITS
bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx); // MI_BFIELD_BITS < n <= MI_BCHUNK_BITS
// Find a sequence of `n` bits in the bbitmap with all bits set, and try to atomically clear all.
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
mi_decl_nodiscard static inline bool mi_bbitmap_try_find_and_clearN(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx) {
if (n==1) return mi_bbitmap_try_find_and_clear(bbitmap, tseq, pidx); // small pages
if (n==8) return mi_bbitmap_try_find_and_clear8(bbitmap, tseq, pidx); // medium pages
// if (n==MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearX(bbitmap, tseq, pidx); // large pages
if (n==0 || n>MI_BCHUNK_BITS) return false; // cannot be more than a chunk
if (n<=MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearNX(bbitmap, tseq, n, pidx);
return mi_bbitmap_try_find_and_clearN_(bbitmap, tseq, n, pidx);
}
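// For example: `mi_bbitmap_try_find_and_clearN(bb, 8, tseq, &idx)` uses the
// specialized byte-aligned search; `n == 40` (within one bfield on 64-bit) uses
// the `NX` variant; and `n == 100` falls through to the general cross-field
// `N_` search, still within a single chunk.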
#endif // MI_BITMAP_H

src/free.c

View file

@@ -23,9 +23,6 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block);
// Free
// ------------------------------------------------------
// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block);
// regular free of a (thread local) block pointer
// fast path written carefully to prevent spilling on the stack
static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full)
@@ -50,6 +47,40 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
}
}
// Forward declaration for multi-threaded collect
static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept;
// Free a block multi-threaded
static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_noexcept
{
// adjust stats (after padding check and potentially recursive `mi_free` above)
mi_stat_free(page, block); // stat_free may access the padding
mi_track_free_size(block, mi_page_usable_size_of(page, block));
// _mi_padding_shrink(page, block, sizeof(mi_block_t));
#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading
size_t dbgsize = mi_usable_size(block);
if (dbgsize > MI_MiB) { dbgsize = MI_MiB; }
_mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize);
#endif
// push atomically on the page thread free list
mi_thread_free_t tf_new;
mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free);
do {
mi_block_set_next(page, block, mi_tf_block(tf_old));
tf_new = mi_tf_create(block, true /* always use owned: try to claim it if the page is abandoned */);
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); // todo: release is enough?
// and atomically try to collect the page if it was abandoned
const bool is_owned_now = !mi_tf_is_owned(tf_old);
if (is_owned_now) {
mi_assert_internal(mi_page_is_abandoned(page));
mi_free_try_collect_mt(page,block);
}
}
// Adjust a block that was allocated aligned, to the actual start of the block in the page.
// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the
// `page_start` and `block_size` fields; however these are constant and the page won't be
@@ -57,7 +88,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) {
mi_assert_internal(page!=NULL && p!=NULL);
size_t diff = (uint8_t*)p - page->page_start;
size_t diff = (uint8_t*)p - mi_page_start(page);
size_t adjust;
if mi_likely(page->block_size_shift != 0) {
adjust = diff & (((size_t)1 << page->block_size_shift) - 1);
@@ -81,218 +112,207 @@ static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, vo
}
#endif
// free a local pointer (page parameter comes first for better codegen)
static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept {
MI_UNUSED(segment);
static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept {
mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p);
mi_block_check_unguard(page, block, p);
mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */);
}
// free a pointer owned by another thread (page parameter comes first for better codegen)
static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept {
static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept {
if (p==NULL) return; // a NULL pointer is seen as abandoned (tid==0) with a full flag set
#if !MI_PAGE_MAP_FLAT
if (page==&_mi_page_empty) return; // an invalid pointer may lead to using the empty page
#endif
mi_assert_internal(p!=NULL && page != NULL && page != &_mi_page_empty);
mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865)
mi_block_check_unguard(page, block, p);
mi_free_block_mt(page, segment, block);
mi_free_block_mt(page, block);
}
// generic free (for runtime integration)
void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
if (is_local) mi_free_generic_local(page,segment,p);
else mi_free_generic_mt(page,segment,p);
void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
if (is_local) mi_free_generic_local(page,p);
else mi_free_generic_mt(page,p);
}
// Get the segment data belonging to a pointer
// This is just a single `and` in release mode but does further checks in debug mode
// (and secure mode) to see if this was a valid pointer.
static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg)
{
MI_UNUSED(msg);
#if (MI_DEBUG>0)
// Get the page belonging to a pointer
// Does further checks in debug mode to see if this was a valid pointer.
static inline mi_page_t* mi_validate_ptr_page(const void* p, const char* msg)
{
MI_UNUSED_RELEASE(msg);
#if MI_DEBUG
if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) {
_mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
return NULL;
}
#endif
mi_segment_t* const segment = _mi_ptr_segment(p);
if mi_unlikely(segment==NULL) return segment;
#if (MI_DEBUG>0)
if mi_unlikely(!mi_is_in_heap_region(p)) {
_mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
"(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) {
_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
mi_page_t* page = _mi_safe_ptr_page(p);
if (page == NULL) {
if (p != NULL) {
_mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p);
}
#if !MI_PAGE_MAP_FLAT
page = (mi_page_t*)&_mi_page_empty;
#endif
}
return page;
#else
return _mi_ptr_page(p);
#endif
#if (MI_DEBUG>0 || MI_SECURE>=4)
if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) {
_mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
return NULL;
}
#endif
return segment;
}
// Free a block
// Fast path written carefully to prevent register spilling on the stack
void mi_free(void* p) mi_attr_noexcept
{
mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
if mi_unlikely(segment==NULL) return;
mi_page_t* const page = mi_validate_ptr_page(p,"mi_free");
const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
mi_page_t* const page = _mi_segment_page_of(segment, p);
#if MI_PAGE_MAP_FLAT // if not flat, p==NULL leads to `_mi_page_empty` which leads to `mi_free_generic_mt`
if mi_unlikely(page==NULL) return;
#endif
mi_assert_internal(page!=NULL);
if mi_likely(is_local) { // thread-local free?
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
// thread-local, aligned, and not a full page
mi_block_t* const block = (mi_block_t*)p;
mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
}
else {
// page is full or contains (inner) aligned blocks; use generic path
mi_free_generic_local(page, segment, p);
}
const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page));
if mi_likely(xtid == 0) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) == 0`
// thread-local, aligned, and not a full page
mi_block_t* const block = (mi_block_t*)p;
mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
}
else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) != 0`
// page is local, but is full or contains (inner) aligned blocks; use generic path
mi_free_generic_local(page, p);
}
// free-ing in a page owned by a heap in another thread, or an abandoned page (not belonging to a heap)
else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid != mi_page_thread_id(page) && mi_page_flags(page) == 0`
// the page is not full and has no (inner) aligned blocks, so `p` is the block start; push it on the thread_free list
mi_block_t* const block = (mi_block_t*)p;
mi_free_block_mt(page,block);
}
else {
// not thread-local; use generic path
mi_free_generic_mt(page, segment, p);
// page is full or contains (inner) aligned blocks; use generic multi-thread path
mi_free_generic_mt(page, p);
}
}
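// Illustrative summary (not part of this commit): the `xtid` dispatch above
// assumes the page's `xthread_id` stores the owning thread id with its low bits
// reserved for the page flags (full / has-aligned), so one XOR classifies the free:
//
//   xtid == 0                        local free, no flags   -> fast local path
//   0 < xtid <= MI_PAGE_FLAG_MASK    local free, flags set  -> mi_free_generic_local
//   (xtid & MI_PAGE_FLAG_MASK) == 0  non-local, no flags    -> mi_free_block_mt
//   otherwise                        non-local, flags set   -> mi_free_generic_mt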
// return true if successful
bool _mi_free_delayed_block(mi_block_t* block) {
// get segment and page
mi_assert_internal(block!=NULL);
const mi_segment_t* const segment = _mi_ptr_segment(block);
mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(_mi_thread_id() == segment->thread_id);
mi_page_t* const page = _mi_segment_page_of(segment, block);
// Clear the no-delayed flag so delayed freeing is used again for this page.
// This must be done before collecting the free lists on this page -- otherwise
// some blocks may end up in the page `thread_free` list with no blocks in the
// heap `thread_delayed_free` list which may cause the page to be never freed!
// (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) {
return false;
}
// collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count
_mi_page_free_collect(page, false);
// and free the block (possibly freeing the page as well since `used` is updated)
mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */);
return true;
}
// ------------------------------------------------------
// Multi-threaded Free (`_mt`)
// ------------------------------------------------------
// Push a block that is owned by another thread on its page-local thread free
// list or it's heap delayed free list. Such blocks are later collected by
// the owning thread in `_mi_free_delayed_block`.
static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block )
{
// Try to put the block on either the page-local thread free list,
// or the heap delayed free list (if this is the first non-local free in that page)
mi_thread_free_t tfreex;
bool use_delayed;
mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE);
if mi_unlikely(use_delayed) {
// unlikely: this only happens on the first concurrent free in a page that is in the full list
tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING);
}
else {
// usual: directly add to page thread_free list
mi_block_set_next(page, block, mi_tf_block(tfree));
tfreex = mi_tf_set_block(tfree,block);
}
} while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
// If this was the first non-local free, we need to push it on the heap delayed free list instead
if mi_unlikely(use_delayed) {
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page);
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
do {
mi_block_set_nextx(heap,block,dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
}
// and reset the MI_DELAYED_FREEING flag
tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
tfreex = tfree;
mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
} while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free);
static inline bool mi_page_queue_len_is_atmost( mi_heap_t* heap, size_t block_size, size_t atmost) {
mi_page_queue_t* const pq = mi_page_queue(heap,block_size);
mi_assert_internal(pq!=NULL);
return (pq->count <= atmost);
/*
for(mi_page_t* p = pq->first; p!=NULL; p = p->next, atmost--) {
if (atmost == 0) { return false; }
}
return true;
*/
}
// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block)
{
// first see if the segment was abandoned and if we can reclaim it into our thread
if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 &&
#if MI_HUGE_PAGE_ABANDON
segment->page_kind != MI_PAGE_HUGE &&
#endif
mi_atomic_load_relaxed(&segment->thread_id) == 0 && // segment is abandoned?
mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initialized (issue #944))
static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept {
mi_assert_internal(mi_page_is_owned(page));
mi_assert_internal(mi_page_is_abandoned(page));
// we own the page now..
// safe to collect the thread atomic free list
// use the `_partly` version to avoid atomic operations since we already have the `mt_free` pointing into the thread free list
_mi_page_free_collect_partly(page, mt_free);
#if MI_DEBUG > 1
if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); }
#endif
// 1. free if the page is free now (this is updated by `_mi_page_free_collect_partly`)
if (mi_page_all_free(page))
{
// the segment is abandoned, try to reclaim it into our heap
if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) {
mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc);
mi_free(block); // recursively free as now it will be a local free in our heap
return;
// first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish)
_mi_arenas_page_unabandon(page);
// we can free the page directly
_mi_arenas_page_free(page);
return;
}
// 2. we can try to reclaim the page for ourselves
// note: we only reclaim if the page originated from our heap (the heap field is preserved on abandonment)
// to avoid claiming arbitrary object sizes and limit indefinite expansion. This helps benchmarks like `larson`
if (page->block_size <= MI_SMALL_MAX_OBJ_SIZE) // only for small sized blocks
{
const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free);
if (reclaim_on_free >= 0) { // and reclaiming is allowed
// get our heap (with the right tag)
// note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should
// not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944)
mi_heap_t* heap = mi_prim_get_default_heap();
if (heap != page->heap) {
if (mi_heap_is_initialized(heap)) {
heap = _mi_heap_by_tag(heap, page->heap_tag);
}
}
// can we reclaim into this heap?
if (heap != NULL && heap->allow_page_reclaim) {
const long reclaim_max = _mi_option_get_fast(mi_option_page_reclaim_max);
if ((heap == page->heap && mi_page_queue_len_is_atmost(heap, page->block_size, reclaim_max)) || // only reclaim if we were the originating heap, and we have at most N pages already
(reclaim_on_free == 1 && // OR if the reclaim across heaps is allowed
!mi_page_is_used_at_frac(page, 8) && // and the page is not too full
!heap->tld->is_in_threadpool && // and not part of a threadpool
_mi_arena_memid_is_suitable(page->memid, heap->exclusive_arena)) // and the memory is suitable
)
{
// first remove it from the abandoned pages in the arena -- this waits for any readers to finish
_mi_arenas_page_unabandon(page);
_mi_heap_page_reclaim(heap, page);
mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1);
return;
}
}
}
}
// The padding check may access the non-thread-owned page for the key values.
// that is safe as these are constant and the page won't be freed (as the block is not freed yet).
mi_check_padding(page, block);
// adjust stats (after padding check and potentially recursive `mi_free` above)
mi_stat_free(page, block); // stat_free may access the padding
mi_track_free_size(block, mi_page_usable_size_of(page,block));
// for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
_mi_padding_shrink(page, block, sizeof(mi_block_t));
if (segment->page_kind == MI_PAGE_HUGE) {
#if MI_HUGE_PAGE_ABANDON
// huge page segments are always abandoned and can be freed immediately
_mi_segment_huge_page_free(segment, page, block);
// 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations
if (!mi_page_is_used_at_frac(page, 8) && // only reabandon once a full page has enough free blocks again, to prevent it from being immediately re-abandoned
!mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA &&
_mi_arenas_page_try_reabandon_to_mapped(page))
{
return;
#else
// huge pages are special as they occupy the entire segment
// as these are large we reset the memory occupied by the page so it is available to other threads
// (as the owning thread needs to actually free the memory later).
_mi_segment_huge_page_reset(segment, page, block);
#endif
}
else {
#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading
memset(block, MI_DEBUG_FREED, mi_usable_size(block));
#endif
}
// and finally free the actual block by pushing it on the owning heap
// thread_delayed free list (or heap delayed free list)
mi_free_block_delayed_mt(page,block);
// not reclaimed or free'd, unown again
// _mi_page_unown(page);
mi_page_unown_from_free(page, mt_free);
}
// release ownership of a page. This may free the page if all (other) blocks were concurrently
// freed in the meantime. Returns true if the page was freed.
// This is a specialized version of `mi_page_unown` to (try to) avoid calling `mi_page_free_collect` again.
static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free) {
mi_assert_internal(mi_page_is_owned(page));
mi_assert_internal(mi_page_is_abandoned(page));
mi_assert_internal(mt_free != NULL);
mi_assert_internal(page->used > 1);
mi_thread_free_t tf_expect = mi_tf_create(mt_free, true);
mi_thread_free_t tf_new = mi_tf_create(mt_free, false);
while mi_unlikely(!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_expect, tf_new)) {
mi_assert_internal(mi_tf_is_owned(tf_expect));
while (mi_tf_block(tf_expect) != NULL) {
_mi_page_free_collect(page,false); // update used
if (mi_page_all_free(page)) { // it may become free just before unowning it
_mi_arenas_page_unabandon(page);
_mi_arenas_page_free(page);
return true;
}
tf_expect = mi_atomic_load_relaxed(&page->xthread_free);
}
mi_assert_internal(mi_tf_block(tf_expect)==NULL);
tf_new = mi_tf_create(NULL, false);
}
return false;
}
@@ -316,9 +336,8 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p
}
static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg);
if mi_unlikely(segment==NULL) return 0;
const mi_page_t* const page = _mi_segment_page_of(segment, p);
const mi_page_t* const page = mi_validate_ptr_page(p,msg);
if mi_unlikely(page==NULL) return 0;
if mi_likely(!mi_page_has_aligned(page)) {
const mi_block_t* block = (const mi_block_t*)p;
return mi_page_usable_size_of(page, block);
@@ -513,21 +532,21 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
// only maintain stats for smaller objects if requested
#if (MI_STAT>0)
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
#if (MI_STAT < 2)
void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
#if (MI_STAT < 2)
MI_UNUSED(block);
#endif
#endif
mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_usable_block_size(page);
#if (MI_STAT>1)
#if (MI_STAT>1)
const size_t usize = mi_page_usable_size_of(page, block);
mi_heap_stat_decrease(heap, malloc_requested, usize);
#endif
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
#endif
if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, malloc_normal, bsize);
#if (MI_STAT > 1)
#if (MI_STAT > 1)
mi_heap_stat_decrease(heap, malloc_bins[_mi_bin(bsize)], 1);
#endif
#endif
}
else {
const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc
@@ -535,7 +554,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
}
}
#else
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
MI_UNUSED(page); MI_UNUSED(block);
}
#endif
@@ -553,7 +572,7 @@ static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) {
const size_t bsize = mi_page_block_size(page);
const size_t psize = _mi_os_page_size();
mi_assert_internal(bsize > psize);
mi_assert_internal(_mi_page_segment(page)->allow_decommit);
mi_assert_internal(!page->memid.is_pinned);
void* gpage = (uint8_t*)block + bsize - psize;
mi_assert_internal(_mi_is_aligned(gpage, psize));
_mi_os_unprotect(gpage, psize);

src/heap.c

View file

@@ -7,11 +7,8 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"
#include "mimalloc/prim.h" // mi_prim_get_default_heap
#include <string.h> // memset, memcpy
#if defined(_MSC_VER) && (_MSC_VER < 1920)
#pragma warning(disable:4204) // non-constant aggregate initializer
#endif
@@ -58,8 +55,6 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
MI_UNUSED(arg2);
MI_UNUSED(pq);
mi_assert_internal(mi_page_heap(page) == heap);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == heap->thread_id);
mi_assert_expensive(_mi_page_is_valid(page));
return true;
}
@@ -68,6 +63,9 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
static bool mi_heap_is_valid(mi_heap_t* heap) {
mi_assert_internal(heap!=NULL);
mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL);
for (size_t bin = 0; bin < MI_BIN_COUNT; bin++) {
mi_assert_internal(_mi_page_queue_is_valid(heap, &heap->pages[bin]));
}
return true;
}
#endif
@@ -98,7 +96,7 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t
if (mi_page_all_free(page)) {
// no more used blocks, free the page.
// note: this will free retired pages as well.
_mi_page_free(page, pq, collect >= MI_FORCE);
_mi_page_free(page, pq);
}
else if (collect == MI_ABANDON) {
// still used blocks but the thread is done; abandon the page
@@ -107,66 +105,29 @@
return true; // don't break
}
static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
MI_UNUSED(arg1);
MI_UNUSED(arg2);
MI_UNUSED(heap);
MI_UNUSED(pq);
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
return true; // don't break
}
static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
{
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
mi_assert_expensive(mi_heap_is_valid(heap));
const bool force = (collect >= MI_FORCE);
_mi_deferred_free(heap, force);
// python/cpython#112532: we may be called from a thread that is not the owner of the heap
const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id());
// note: never reclaim on collect but leave it to threads that need storage to reclaim
if (
#ifdef NDEBUG
collect == MI_FORCE
#else
collect >= MI_FORCE
#endif
&& is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim)
{
// the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
// if all memory is freed by now, all segments should be freed.
// note: this only collects in the current subprocess
_mi_abandoned_reclaim_all(heap, &heap->tld->segments);
}
// if abandoning, mark all pages to no longer add to delayed_free
if (collect == MI_ABANDON) {
mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
}
// free all current thread delayed blocks.
// (if abandoning, after this there are no more thread-delayed references into the pages.)
_mi_heap_delayed_free_all(heap);
// const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id());
// collect retired pages
_mi_heap_collect_retired(heap, force);
// if (_mi_is_main_thread()) { mi_debug_show_arenas(true, false, false); }
// collect all pages owned by this thread
mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
// collect segments (purge pages, this can be expensive so don't force on abandonment)
_mi_segments_collect(collect == MI_FORCE, &heap->tld->segments);
// if forced, collect thread data cache on program-exit (or shared library unload)
if (force && is_main_thread && mi_heap_is_backing(heap)) {
_mi_thread_data_collect(); // collect thread data cache
}
// collect arenas (this is program wide so don't force purges on abandonment of threads)
_mi_arenas_collect(collect == MI_FORCE /* force purge? */);
//mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1);
_mi_arenas_collect(collect == MI_FORCE /* force purge? */, collect >= MI_FORCE /* visit all? */, heap->tld);
// merge statistics
if (collect <= MI_FORCE) {
@ -192,8 +153,12 @@ void mi_collect(bool force) mi_attr_noexcept {
----------------------------------------------------------- */
mi_heap_t* mi_heap_get_default(void) {
mi_thread_init();
return mi_prim_get_default_heap();
mi_heap_t* heap = mi_prim_get_default_heap();
if mi_unlikely(!mi_heap_is_initialized(heap)) {
mi_thread_init();
heap = mi_prim_get_default_heap();
}
return heap;
}
static bool mi_heap_is_default(const mi_heap_t* heap) {
@ -206,52 +171,90 @@ mi_heap_t* mi_heap_get_backing(void) {
mi_assert_internal(heap!=NULL);
mi_heap_t* bheap = heap->tld->heap_backing;
mi_assert_internal(bheap!=NULL);
mi_assert_internal(bheap->thread_id == _mi_thread_id());
mi_assert_internal(bheap->tld->thread_id == _mi_thread_id());
return bheap;
}
void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) {
// todo: make order of parameters consistent (but would that break compat with CPython?)
void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, uint8_t heap_tag, mi_tld_t* tld)
{
mi_assert_internal(heap!=NULL);
mi_memid_t memid = heap->memid;
_mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
heap->tld = tld;
heap->thread_id = _mi_thread_id();
heap->arena_id = arena_id;
heap->no_reclaim = noreclaim;
heap->tag = tag;
if (heap == tld->heap_backing) {
heap->memid = memid;
heap->tld = tld; // avoid reading the thread-local tld during initialization
heap->exclusive_arena = _mi_arena_from_id(arena_id);
heap->allow_page_reclaim = (!allow_destroy && mi_option_get(mi_option_page_reclaim_on_free) >= 0);
heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0);
heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32);
heap->tag = heap_tag;
if (heap->tld->is_in_threadpool) {
// if we run as part of a thread pool it is better not to arbitrarily reclaim abandoned pages into our heap
// (this is checked in `free.c:mi_free_try_collect_mt`)
// .. but abandoning is good in this case: reduce the full page retain to a quarter (possibly to 0)
// (so blocked threads do not hold on to too much memory)
if (heap->page_full_retain > 0) {
heap->page_full_retain = heap->page_full_retain / 4;
}
}
if (heap->tld->heap_backing == NULL) {
heap->tld->heap_backing = heap; // first heap becomes the backing heap
_mi_random_init(&heap->random);
}
else {
_mi_random_split(&tld->heap_backing->random, &heap->random);
_mi_random_split(&heap->tld->heap_backing->random, &heap->random);
}
heap->cookie = _mi_heap_random_next(heap) | 1;
heap->keys[0] = _mi_heap_random_next(heap);
heap->keys[1] = _mi_heap_random_next(heap);
//heap->keys[0] = _mi_heap_random_next(heap);
//heap->keys[1] = _mi_heap_random_next(heap);
_mi_heap_guarded_init(heap);
// push on the thread local heaps list
heap->next = heap->tld->heaps;
heap->tld->heaps = heap;
}
mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) {
mi_heap_t* bheap = mi_heap_get_backing();
mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode?
if (heap == NULL) return NULL;
mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld) {
mi_assert_internal(tld!=NULL);
mi_assert(heap_tag >= 0 && heap_tag < 256);
_mi_heap_init(heap, bheap->tld, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */);
// allocate and initialize a heap
mi_memid_t memid;
mi_heap_t* heap;
if (arena_id == _mi_arena_id_none()) {
heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid);
}
else {
// heaps associated with a specific arena are allocated in that arena
// note: takes up at least one slice which is quite wasteful...
heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid);
}
if (heap==NULL) {
_mi_error_message(ENOMEM, "unable to allocate heap meta-data\n");
return NULL;
}
heap->memid = memid;
_mi_heap_init(heap, arena_id, allow_destroy, (uint8_t)heap_tag, tld);
return heap;
}
mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) {
mi_heap_t* bheap = mi_heap_get_backing();
mi_assert_internal(bheap != NULL);
return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld);
}
mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
return mi_heap_new_ex(0 /* default heap tag */, false /* don't allow `mi_heap_destroy` */, arena_id);
return mi_heap_new_ex(0 /* default heap tag */, false /* allow destroy? */, arena_id);
}
mi_decl_nodiscard mi_heap_t* mi_heap_new(void) {
// don't reclaim abandoned memory, as otherwise destroy is unsafe
return mi_heap_new_ex(0 /* default heap tag */, true /* no reclaim */, _mi_arena_id_none());
return mi_heap_new_ex(0 /* default heap tag */, true /* allow destroy? */, _mi_arena_id_none());
}
bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) {
return _mi_arena_memid_is_suitable(memid, heap->arena_id);
return _mi_arena_memid_is_suitable(memid, heap->exclusive_arena);
}
uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
@ -263,14 +266,14 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
mi_assert_internal(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap));
// TODO: copy full empty heap instead?
memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
_mi_memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
_mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
heap->thread_delayed_free = NULL;
// heap->thread_delayed_free = NULL;
heap->page_count = 0;
}
// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
static void mi_heap_free(mi_heap_t* heap) {
static void mi_heap_free(mi_heap_t* heap, bool do_free_mem) {
mi_assert(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
@ -297,7 +300,9 @@ static void mi_heap_free(mi_heap_t* heap) {
mi_assert_internal(heap->tld->heaps != NULL);
// and free the used memory
mi_free(heap);
if (do_free_mem) {
_mi_meta_free(heap, sizeof(*heap), heap->memid);
}
}
// return a heap on the same thread as `heap` specialized for the specified tag (if it exists)
@ -324,24 +329,24 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
MI_UNUSED(pq);
// ensure no more thread_delayed_free will be added
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
//_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
// stats
const size_t bsize = mi_page_block_size(page);
if (bsize > MI_LARGE_OBJ_SIZE_MAX) {
if (bsize > MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, malloc_huge, bsize);
}
#if (MI_STAT)
#if (MI_STAT)
_mi_page_free_collect(page, false); // update used count
const size_t inuse = page->used;
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, malloc_normal, bsize * inuse);
#if (MI_STAT>1)
#if (MI_STAT>1)
mi_heap_stat_decrease(heap, malloc_bins[_mi_bin(bsize)], inuse);
#endif
#endif
}
mi_heap_stat_decrease(heap, malloc_requested, bsize * inuse); // todo: off for aligned blocks...
#endif
#endif
// pretend it is all free now
mi_assert_internal(mi_page_thread_free(page) == NULL);
@ -351,7 +356,8 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
// mi_page_free(page,false);
page->next = NULL;
page->prev = NULL;
_mi_segment_page_free(page,false /* no force? */, &heap->tld->segments);
mi_page_set_heap(page, NULL);
_mi_arenas_page_free(page);
return true; // keep going
}
@ -372,7 +378,8 @@ static bool mi_cdecl mi_heap_track_block_free(const mi_heap_t* heap, const mi_he
void mi_heap_destroy(mi_heap_t* heap) {
mi_assert(heap != NULL);
mi_assert(mi_heap_is_initialized(heap));
mi_assert(heap->no_reclaim);
mi_assert(!heap->allow_page_reclaim);
mi_assert(!heap->allow_page_abandon);
mi_assert_expensive(mi_heap_is_valid(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
#if MI_GUARDED
@ -380,9 +387,9 @@ void mi_heap_destroy(mi_heap_t* heap) {
mi_heap_delete(heap);
return;
#else
if (!heap->no_reclaim) {
if (heap->allow_page_reclaim) {
_mi_warning_message("'mi_heap_destroy' called but ignored as the heap was not created with 'allow_destroy' (heap at %p)\n", heap);
// don't free in case it may contain reclaimed pages
// don't free it in this case as it may still contain reclaimed pages
mi_heap_delete(heap);
}
else {
@ -392,7 +399,7 @@ void mi_heap_destroy(mi_heap_t* heap) {
#endif
// free all pages
_mi_heap_destroy_pages(heap);
mi_heap_free(heap);
mi_heap_free(heap,true);
}
#endif
}
@ -404,7 +411,7 @@ void _mi_heap_unsafe_destroy_all(mi_heap_t* heap) {
mi_heap_t* curr = heap->tld->heaps;
while (curr != NULL) {
mi_heap_t* next = curr->next;
if (curr->no_reclaim) {
if (!curr->allow_page_reclaim) {
mi_heap_destroy(curr);
}
else {
@ -419,44 +426,30 @@ void _mi_heap_unsafe_destroy_all(mi_heap_t* heap) {
----------------------------------------------------------- */
// Transfer the pages from one heap to the other
static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
mi_assert_internal(heap!=NULL);
if (from==NULL || from->page_count == 0) return;
//static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
// mi_assert_internal(heap!=NULL);
// if (from==NULL || from->page_count == 0) return;
//
// // transfer all pages by appending the queues; this will set a new heap field
// for (size_t i = 0; i <= MI_BIN_FULL; i++) {
// mi_page_queue_t* pq = &heap->pages[i];
// mi_page_queue_t* append = &from->pages[i];
// size_t pcount = _mi_page_queue_append(heap, pq, append);
// heap->page_count += pcount;
// from->page_count -= pcount;
// }
// mi_assert_internal(from->page_count == 0);
//
// // and reset the `from` heap
// mi_heap_reset_pages(from);
//}
// reduce the size of the delayed frees
_mi_heap_delayed_free_partial(from);
// transfer all pages by appending the queues; this will set a new heap field
// so threads may do delayed frees in either heap for a while.
// note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state
// so after this only the new heap will get delayed frees
for (size_t i = 0; i <= MI_BIN_FULL; i++) {
mi_page_queue_t* pq = &heap->pages[i];
mi_page_queue_t* append = &from->pages[i];
size_t pcount = _mi_page_queue_append(heap, pq, append);
heap->page_count += pcount;
from->page_count -= pcount;
}
mi_assert_internal(from->page_count == 0);
// and do outstanding delayed frees in the `from` heap
// note: be careful here as the `heap` field in all those pages no longer points to `from`;
// this turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls
// the regular `_mi_free_delayed_block` which is safe.
_mi_heap_delayed_free_all(from);
#if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353
mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL);
#endif
// and reset the `from` heap
mi_heap_reset_pages(from);
}
// are two heaps compatible with respect to heap-tag, exclusive arena etc.
static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) {
return (heap1->tag == heap2->tag && // store same kind of objects
heap1->arena_id == heap2->arena_id); // same arena preference
}
//// are two heaps compatible with respect to heap-tag, exclusive arena etc.
//static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) {
// return (heap1->tag == heap2->tag && // store same kind of objects
// heap1->tld->subproc == heap2->tld->subproc && // same sub-process
// heap1->arena_id == heap2->arena_id); // same arena preference
//}
// Safe delete a heap without freeing any still allocated blocks in that heap.
void mi_heap_delete(mi_heap_t* heap)
@ -466,17 +459,11 @@ void mi_heap_delete(mi_heap_t* heap)
mi_assert_expensive(mi_heap_is_valid(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
mi_heap_t* bheap = heap->tld->heap_backing;
if (bheap != heap && mi_heaps_are_compatible(bheap,heap)) {
// transfer still used pages to the backing heap
mi_heap_absorb(bheap, heap);
}
else {
// the backing heap abandons its pages
_mi_heap_collect_abandon(heap);
}
// abandon all pages
_mi_heap_collect_abandon(heap);
mi_assert_internal(heap->page_count==0);
mi_heap_free(heap);
mi_heap_free(heap,true);
}
mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
@ -490,7 +477,63 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
}
/* -----------------------------------------------------------
Load/unload heaps
----------------------------------------------------------- */
void mi_heap_unload(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
mi_assert_expensive(mi_heap_is_valid(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
if (heap->exclusive_arena == NULL) {
_mi_warning_message("cannot unload heaps that are not associated with an exclusive arena\n");
return;
}
// abandon all pages so the thread id in each page is cleared
_mi_heap_collect_abandon(heap);
mi_assert_internal(heap->page_count==0);
// remove from heap list
mi_heap_free(heap, false /* but don't actually free the memory */);
// disassociate from the current thread-local and static state
heap->tld = NULL;
return;
}
bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) {
mi_assert(mi_heap_is_initialized(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
if (heap->exclusive_arena == NULL) {
_mi_warning_message("cannot reload heaps that were not associated with an exclusive arena\n");
return false;
}
if (heap->tld != NULL) {
_mi_warning_message("cannot reload heaps that were not unloaded first\n");
return false;
}
mi_arena_t* arena = _mi_arena_from_id(arena_id);
if (heap->exclusive_arena != arena) {
_mi_warning_message("trying to reload a heap at a different arena address: %p vs %p\n", heap->exclusive_arena, arena);
return false;
}
mi_assert_internal(heap->page_count==0);
// re-associate with the current thread-local and static state
heap->tld = mi_heap_get_default()->tld;
// reinit direct pages (as we may be in a different process)
mi_assert_internal(heap->page_count == 0);
for (size_t i = 0; i < MI_PAGES_DIRECT; i++) {
heap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty;
}
// push on the thread local heaps list
heap->next = heap->tld->heaps;
heap->tld->heaps = heap;
return true;
}
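
// A usage sketch (not part of this diff) of the unload/reload protocol above:
// a heap created in an exclusive arena is unloaded in one thread and reloaded
// in another. It assumes the public `mi_reserve_os_memory_ex` and
// `mi_heap_new_in_arena` entry points; error handling is elided.
#if 0
static mi_arena_id_t arena_id;
static mi_heap_t* shared_heap;

static void thread_one(void) {
  // reserve a 64 MiB exclusive arena and create a heap inside it
  mi_reserve_os_memory_ex(64*1024*1024, true /* commit */, false /* allow large */, true /* exclusive */, &arena_id);
  shared_heap = mi_heap_new_in_arena(arena_id);
  void* p = mi_heap_malloc(shared_heap, 128); (void)p;
  mi_heap_unload(shared_heap);  // abandon the pages and detach from this thread
}

static void thread_two(void) {
  if (mi_heap_reload(shared_heap, arena_id)) {  // re-associate with this thread
    mi_heap_delete(shared_heap);
  }
}
#endif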
/* -----------------------------------------------------------
Analysis
@ -499,11 +542,8 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
// static since it is not thread safe to access heaps from other threads.
static mi_heap_t* mi_heap_of_block(const void* p) {
if (p == NULL) return NULL;
mi_segment_t* segment = _mi_ptr_segment(p);
bool valid = (_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(valid);
if mi_unlikely(!valid) return NULL;
return mi_page_heap(_mi_segment_page_of(segment,p));
mi_page_t* page = _mi_ptr_page(p); // TODO: check pointer validity?
return mi_page_heap(page);
}
bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
@ -578,7 +618,7 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_
if (page->used == 0) return true;
size_t psize;
uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize);
uint8_t* const pstart = mi_page_area(page, &psize);
mi_heap_t* const heap = mi_page_heap(page);
const size_t bsize = mi_page_block_size(page);
const size_t ubsize = mi_page_usable_block_size(page); // without padding

View file

@ -11,32 +11,31 @@ terms of the MIT license. A copy of the license can be found in the file
#include <string.h> // memcpy, memset
#include <stdlib.h> // atexit
#define MI_MEMID_INIT(kind) {{{NULL,0}}, kind, true /* pinned */, true /* committed */, false /* zero */ }
#define MI_MEMID_STATIC MI_MEMID_INIT(MI_MEM_STATIC)
// Empty page used to initialize the small free pages array
const mi_page_t _mi_page_empty = {
0,
false, false, false, false,
0, // capacity
0, // reserved capacity
{ 0 }, // flags
false, // is_zero
0, // retire_expire
NULL, // free
NULL, // local_free
0, // used
0, // block size shift
0, // heap tag
0, // block_size
NULL, // page_start
MI_ATOMIC_VAR_INIT(MI_PAGE_IN_FULL_QUEUE), // xthread_id (must set flag to catch NULL on a free)
NULL, // free
0, // used
0, // capacity
0, // reserved capacity
0, // block size shift
0, // retire_expire
NULL, // local_free
MI_ATOMIC_VAR_INIT(0), // xthread_free
0, // block_size
NULL, // page_start
0, // heap tag
false, // is_zero
#if (MI_PADDING || MI_ENCODE_FREELIST)
{ 0, 0 },
#endif
MI_ATOMIC_VAR_INIT(0), // xthread_free
MI_ATOMIC_VAR_INIT(0), // xheap
NULL, NULL
#if MI_INTPTR_SIZE==4
, { NULL }
{ 0, 0 }, // keys
#endif
NULL, // xheap
NULL, NULL, // next, prev
MI_ARENA_SLICE_SIZE, // page_committed
MI_MEMID_STATIC // memid
};
#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
@ -51,7 +50,7 @@ const mi_page_t _mi_page_empty = {
// Empty page queues for every bin
#define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) }
#define QNULL(sz) { NULL, NULL, 0, (sz)*sizeof(uintptr_t) }
#define MI_PAGE_QUEUES_EMPTY \
{ QNULL(1), \
QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \
@ -63,8 +62,8 @@ const mi_page_t _mi_page_empty = {
QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \
QNULL(MI_LARGE_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \
QNULL(MI_LARGE_OBJ_WSIZE_MAX + 2) /* Full queue */ }
QNULL(MI_LARGE_MAX_OBJ_WSIZE + 1 /* 655360, Huge queue */), \
QNULL(MI_LARGE_MAX_OBJ_WSIZE + 2) /* Full queue */ }
#define MI_STAT_COUNT_NULL() {0,0,0}
@ -95,25 +94,83 @@ const mi_page_t _mi_page_empty = {
// may lead to allocation itself on some platforms)
// --------------------------------------------------------
mi_decl_hidden mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
NULL,
MI_ATOMIC_VAR_INIT(NULL),
0, // tid
0, // cookie
0, // arena id
{ 0, 0 }, // keys
{ {0}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
0, 0, // generic count
NULL, // next
false, // can reclaim
0, // tag
static mi_decl_cache_align mi_subproc_t subproc_main
#if __cplusplus
= { }; // empty initializer to prevent running the constructor (with msvc)
#else
= { 0 }; // C zero initialize
#endif
static mi_decl_cache_align mi_tld_t tld_empty = {
0, // thread_id
0, // thread_seq
&subproc_main, // subproc
NULL, // heap_backing
NULL, // heaps list
0, // heartbeat
false, // recurse
false, // is_in_threadpool
{ MI_STAT_VERSION, MI_STATS_NULL }, // stats
MI_MEMID_STATIC // memid
};
mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
&tld_empty, // tld
NULL, // exclusive_arena
0, // cookie
//{ 0, 0 }, // keys
{ {0}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
0, 0, // generic count
NULL, // next
0, // full page retain
false, // can reclaim
true, // can eager abandon
0, // tag
#if MI_GUARDED
0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY
MI_PAGE_QUEUES_EMPTY,
MI_MEMID_STATIC
};
extern mi_decl_hidden mi_decl_cache_align mi_heap_t heap_main;
static mi_decl_cache_align mi_tld_t tld_main = {
0, // thread_id
0, // thread_seq
&subproc_main, // subproc
&heap_main, // heap_backing
&heap_main, // heaps list
0, // heartbeat
false, // recurse
false, // is_in_threadpool
{ MI_STAT_VERSION, MI_STATS_NULL }, // stats
MI_MEMID_STATIC // memid
};
mi_decl_cache_align mi_heap_t heap_main = {
&tld_main, // thread local data
NULL, // exclusive arena
0, // initial cookie
//{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0x846ca68b}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
0, 0, // generic count
NULL, // next heap
2, // full page retain
true, // allow page reclaim
true, // allow page abandon
0, // tag
#if MI_GUARDED
0, 0, 0, 0, 0,
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY,
MI_MEMID_STATIC
};
@ -124,40 +181,6 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
// the thread-local default heap for allocation
mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
extern mi_decl_hidden mi_heap_t _mi_heap_main;
static mi_decl_cache_align mi_subproc_t mi_subproc_default;
static mi_decl_cache_align mi_tld_t tld_main = {
0, false,
&_mi_heap_main, &_mi_heap_main,
{ { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0},
0, 0, 0, 0, 0, &mi_subproc_default,
&tld_main.stats
}, // segments
{ MI_STAT_VERSION, MI_STATS_NULL } // stats
};
mi_decl_cache_align mi_heap_t _mi_heap_main = {
&tld_main,
MI_ATOMIC_VAR_INIT(NULL),
0, // thread id
0, // initial cookie
0, // arena id
{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0x846ca68b}, {0}, 0, true }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
0, 0, // generic count
NULL, // next heap
false, // can reclaim
0, // tag
#if MI_GUARDED
0, 0, 0, 0, 0,
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY
};
bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`.
@ -173,7 +196,7 @@ mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t samp
if (heap->guarded_sample_rate >= 1) {
heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate;
}
heap->guarded_sample_count = heap->guarded_sample_seed; // count down samples
heap->guarded_sample_count = 1 + heap->guarded_sample_seed; // count down samples
}
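
// Sketch of the countdown sampling this enables (an assumption, not part of
// this diff): the fast path decrements `guarded_sample_count` on each
// allocation and only guards when it reaches zero, wrapping back to the rate;
// seeding the count with `1 + guarded_sample_seed` staggers the first sample:
//
//   if (heap->guarded_sample_rate == 0) return false;        // sampling disabled
//   if (--heap->guarded_sample_count > 0) return false;      // not yet
//   heap->guarded_sample_count = heap->guarded_sample_rate;  // reset countdown
//   return true;                                             // guard this allocation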
mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) {
@ -202,28 +225,146 @@ void _mi_heap_guarded_init(mi_heap_t* heap) {
}
#endif
// Initialize main subproc
static void mi_subproc_main_init(void) {
if (subproc_main.memid.memkind != MI_MEM_STATIC) {
subproc_main.memid = _mi_memid_create(MI_MEM_STATIC);
mi_lock_init(&subproc_main.os_abandoned_pages_lock);
mi_lock_init(&subproc_main.arena_reserve_lock);
}
}
// Initialize main tld
static void mi_tld_main_init(void) {
if (tld_main.thread_id == 0) {
tld_main.thread_id = _mi_prim_thread_id();
}
}
// Initialization of the (statically allocated) main heap, and the main tld and subproc.
static void mi_heap_main_init(void) {
if (_mi_heap_main.cookie == 0) {
_mi_heap_main.thread_id = _mi_thread_id();
_mi_heap_main.cookie = 1;
#if defined(_WIN32) && !defined(MI_SHARED_LIB)
_mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking
if (heap_main.cookie == 0) {
// heap
heap_main.cookie = 1;
#if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB)
_mi_random_init_weak(&heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking
#else
_mi_random_init(&_mi_heap_main.random);
_mi_random_init(&heap_main.random);
#endif
_mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
mi_lock_init(&mi_subproc_default.abandoned_os_lock);
mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock);
_mi_heap_guarded_init(&_mi_heap_main);
heap_main.cookie = _mi_heap_random_next(&heap_main);
//heap_main.keys[0] = _mi_heap_random_next(&heap_main);
//heap_main.keys[1] = _mi_heap_random_next(&heap_main);
_mi_heap_guarded_init(&heap_main);
heap_main.allow_page_reclaim = (mi_option_get(mi_option_page_reclaim_on_free) >= 0);
heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0);
heap_main.page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32);
mi_subproc_main_init();
mi_tld_main_init();
}
}
mi_heap_t* _mi_heap_main_get(void) {
mi_heap_main_init();
return &_mi_heap_main;
return &heap_main;
}
/* -----------------------------------------------------------
Thread local data
----------------------------------------------------------- */
// Count current and total created threads
static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
static _Atomic(size_t) thread_total_count;
size_t _mi_current_thread_count(void) {
return mi_atomic_load_relaxed(&thread_count);
}
// The mimalloc thread local data
mi_decl_thread mi_tld_t* thread_tld = &tld_empty;
// Allocate fresh tld
static mi_tld_t* mi_tld_alloc(void) {
mi_atomic_increment_relaxed(&thread_count);
if (_mi_is_main_thread()) {
return &tld_main;
}
else {
// allocate tld meta-data
// note: we need to be careful to not access the tld from `_mi_meta_zalloc`
// (and in turn from `_mi_arena_alloc_aligned` and `_mi_os_alloc_aligned`).
mi_memid_t memid;
mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid);
if (tld==NULL) {
_mi_error_message(ENOMEM, "unable to allocate memory for thread local data\n");
return NULL;
}
tld->memid = memid;
tld->heap_backing = NULL;
tld->heaps = NULL;
tld->subproc = &subproc_main;
tld->thread_id = _mi_prim_thread_id();
tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1);
tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool();
return tld;
}
}
#define MI_TLD_INVALID ((mi_tld_t*)1)
mi_decl_noinline static void mi_tld_free(mi_tld_t* tld) {
if (tld != NULL && tld != MI_TLD_INVALID) {
_mi_stats_done(&tld->stats);
_mi_meta_free(tld, sizeof(mi_tld_t), tld->memid);
}
#if 0
// do not read/write to `thread_tld` on older macOS <= 14 as that will re-initialize the thread local storage
// (since we are calling this during pthread shutdown)
// (and this could happen on other systems as well, so let's never do it)
thread_tld = MI_TLD_INVALID;
#endif
mi_atomic_decrement_relaxed(&thread_count);
}
static mi_tld_t* mi_tld(void) {
mi_tld_t* tld = thread_tld;
if (tld == MI_TLD_INVALID) {
_mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n");
thread_tld = &tld_empty;
}
if (tld==&tld_empty) {
thread_tld = tld = mi_tld_alloc();
}
return tld;
}
mi_subproc_t* _mi_subproc(void) {
// should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()`)
// todo: this will still fail on systems where the first access to a thread-local causes allocation.
// on such systems we can check for this with `mi_prim_get_default_heap` as those heaps are protected (by being
// stored in a TLS slot for example)
mi_heap_t* heap = mi_prim_get_default_heap();
if (heap == NULL) {
return _mi_subproc_main();
}
else {
return heap->tld->subproc; // avoid using thread local storage (`thread_tld`)
}
}
mi_tld_t* _mi_thread_tld(void) mi_attr_noexcept {
// should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()`)
mi_heap_t* heap = mi_prim_get_default_heap();
if (heap == NULL) {
return &tld_empty;
}
else {
return heap->tld;
}
}
@ -231,179 +372,99 @@ mi_heap_t* _mi_heap_main_get(void) {
Sub process
----------------------------------------------------------- */
mi_subproc_t* _mi_subproc_main(void) {
return &subproc_main;
}
mi_subproc_id_t mi_subproc_main(void) {
return NULL;
}
mi_subproc_id_t mi_subproc_new(void) {
mi_memid_t memid = _mi_memid_none();
mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid);
mi_memid_t memid;
mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid);
if (subproc == NULL) return NULL;
subproc->memid = memid;
subproc->abandoned_os_list = NULL;
mi_lock_init(&subproc->abandoned_os_lock);
mi_lock_init(&subproc->abandoned_os_visit_lock);
mi_lock_init(&subproc->os_abandoned_pages_lock);
mi_lock_init(&subproc->arena_reserve_lock);
return subproc;
}
mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) {
return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id);
return (subproc_id == NULL ? &subproc_main : (mi_subproc_t*)subproc_id);
}
void mi_subproc_delete(mi_subproc_id_t subproc_id) {
if (subproc_id == NULL) return;
mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id);
// check if there are no abandoned segments still..
// check if there are still abandoned OS pages..
bool safe_to_delete = false;
mi_lock(&subproc->abandoned_os_lock) {
if (subproc->abandoned_os_list == NULL) {
mi_lock(&subproc->os_abandoned_pages_lock) {
if (subproc->os_abandoned_pages == NULL) {
safe_to_delete = true;
}
}
if (!safe_to_delete) return;
// merge stats back into the main subproc?
_mi_stats_merge_from(&_mi_subproc_main()->stats, &subproc->stats);
// safe to release
// todo: should we refcount subprocesses?
mi_lock_done(&subproc->abandoned_os_lock);
mi_lock_done(&subproc->abandoned_os_visit_lock);
_mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t));
mi_lock_done(&subproc->os_abandoned_pages_lock);
mi_lock_done(&subproc->arena_reserve_lock);
_mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid);
}
void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) {
mi_heap_t* heap = mi_heap_get_default();
if (heap == NULL) return;
mi_assert(heap->tld->segments.subproc == &mi_subproc_default);
if (heap->tld->segments.subproc != &mi_subproc_default) return;
heap->tld->segments.subproc = _mi_subproc_from_id(subproc_id);
mi_tld_t* tld = mi_tld();
if (tld == NULL) return;
mi_assert(tld->subproc == &subproc_main);
if (tld->subproc != &subproc_main) return;
tld->subproc = _mi_subproc_from_id(subproc_id);
}
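
// A usage sketch (not part of this diff): give a group of threads their own
// sub-process so their (abandoned) pages stay separate from the rest of the
// program. Only the public `mi_subproc_*` API above is assumed.
#if 0
static void component_thread(void* arg) {
  mi_subproc_add_current_thread((mi_subproc_id_t)arg); // call before allocating in this thread
  void* p = mi_malloc(100);
  mi_free(p);
}

static void run_component(void) {
  mi_subproc_id_t subproc = mi_subproc_new();
  // ... spawn component threads with `subproc` as argument, then join them ...
  mi_subproc_delete(subproc); // only releases once no abandoned OS pages remain
}
#endif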
/* -----------------------------------------------------------
Initialization and freeing of the thread local heaps
Allocate heap data
----------------------------------------------------------- */
// note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size).
typedef struct mi_thread_data_s {
mi_heap_t heap; // must come first due to cast in `_mi_heap_done`
mi_tld_t tld;
mi_memid_t memid; // must come last due to zero'ing
} mi_thread_data_t;
// Thread meta-data is allocated directly from the OS. For
// some programs that do not use thread pools and allocate and
// destroy many OS threads, this may cause too much overhead
// per thread so we maintain a small cache of recently freed metadata.
#define TD_CACHE_SIZE (32)
static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
static mi_thread_data_t* mi_thread_data_zalloc(void) {
// try to find thread metadata in the cache
bool is_zero = false;
mi_thread_data_t* td = NULL;
for (int i = 0; i < TD_CACHE_SIZE; i++) {
td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
if (td != NULL) {
// found cached allocation, try use it
td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
if (td != NULL) {
break;
}
}
}
// if that fails, allocate as meta data
if (td == NULL) {
mi_memid_t memid;
td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
if (td == NULL) {
// if this fails, try once more. (issue #257)
td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
if (td == NULL) {
// really out of memory
_mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
}
}
if (td != NULL) {
td->memid = memid;
is_zero = memid.initially_zero;
}
}
if (td != NULL && !is_zero) {
_mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid));
}
return td;
}
static void mi_thread_data_free( mi_thread_data_t* tdfree ) {
// try to add the thread metadata to the cache
for (int i = 0; i < TD_CACHE_SIZE; i++) {
mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
if (td == NULL) {
mi_thread_data_t* expected = NULL;
if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) {
return;
}
}
}
// if that fails, just free it directly
_mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid);
}
void _mi_thread_data_collect(void) {
// free all thread metadata from the cache
for (int i = 0; i < TD_CACHE_SIZE; i++) {
mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
if (td != NULL) {
td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
if (td != NULL) {
_mi_os_free(td, sizeof(mi_thread_data_t), td->memid);
}
}
}
}
// Initialize the thread local default heap, called from `mi_thread_init`
static bool _mi_thread_heap_init(void) {
if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true;
if (_mi_is_main_thread()) {
// mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization
// mi_assert_internal(heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization
// the main heap is statically allocated
mi_heap_main_init();
_mi_heap_set_default_direct(&_mi_heap_main);
_mi_heap_set_default_direct(&heap_main);
//mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap());
}
else {
// use `_mi_os_alloc` to allocate directly from the OS
mi_thread_data_t* td = mi_thread_data_zalloc();
if (td == NULL) return false;
// allocates tld data
// note: we cannot access thread-locals yet as that can cause (recursive) allocation
// (on macOS <= 14 for example where the loader allocates thread-local data on demand).
mi_tld_t* tld = mi_tld_alloc();
mi_tld_t* tld = &td->tld;
mi_heap_t* heap = &td->heap;
_mi_tld_init(tld, heap); // must be before `_mi_heap_init`
_mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */);
// allocate and initialize the heap
mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld);
// associate the heap with this thread
// (this is safe, on macOS for example, the heap is set in a dedicated TLS slot and thus does not cause recursive allocation)
_mi_heap_set_default_direct(heap);
// now that the heap is set for this thread, we can set the thread-local tld.
thread_tld = tld;
}
return false;
}
// initialize thread local data
void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) {
_mi_memzero_aligned(tld,sizeof(mi_tld_t));
tld->heap_backing = bheap;
tld->heaps = NULL;
tld->segments.subproc = &mi_subproc_default;
tld->segments.stats = &tld->stats;
}
// Free the thread local default heap (called from `mi_thread_done`)
static bool _mi_thread_heap_done(mi_heap_t* heap) {
if (!mi_heap_is_initialized(heap)) return true;
// reset default heap
_mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
_mi_heap_set_default_direct(_mi_is_main_thread() ? &heap_main : (mi_heap_t*)&_mi_heap_empty);
// switch to backing heap
heap = heap->tld->heap_backing;
@ -423,26 +484,22 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) {
mi_assert_internal(mi_heap_is_backing(heap));
// collect if not the main thread
if (heap != &_mi_heap_main) {
if (heap != &heap_main) {
_mi_heap_collect_abandon(heap);
}
// merge stats
_mi_stats_done(&heap->tld->stats);
// free heap meta data
_mi_meta_free(heap, sizeof(mi_heap_t), heap->memid);
// free if not the main thread
if (heap != &_mi_heap_main) {
mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id());
mi_thread_data_free((mi_thread_data_t*)heap);
}
else {
if (heap == &heap_main) {
#if 0
// never free the main thread even in debug mode; if a dll is linked statically with mimalloc,
// there may still be delete/free calls after the mi_fls_done is called. Issue #207
_mi_heap_destroy_pages(heap);
mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main);
mi_assert_internal(heap->tld->heap_backing == &heap_main);
#endif
}
return false;
}
@ -456,7 +513,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) {
// 1. windows dynamic library:
// call from DllMain on DLL_THREAD_DETACH
// 2. windows static library:
// use `FlsAlloc` to call a destructor when the thread is done
// use special linker section to call a destructor when the thread is done
// 3. unix, pthreads:
// use a pthread key to call a destructor when a pthread is done
//
@ -470,19 +527,14 @@ static void mi_process_setup_auto_thread_done(void) {
if (tls_initialized) return;
tls_initialized = true;
_mi_prim_thread_init_auto_done();
_mi_heap_set_default_direct(&_mi_heap_main);
_mi_heap_set_default_direct(&heap_main);
}
bool _mi_is_main_thread(void) {
return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id());
return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id());
}
static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
size_t _mi_current_thread_count(void) {
return mi_atomic_load_relaxed(&thread_count);
}
// This is called from the `mi_malloc_generic`
void mi_thread_init(void) mi_attr_noexcept
@ -495,8 +547,7 @@ void mi_thread_init(void) mi_attr_noexcept
// fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
if (_mi_thread_heap_init()) return; // returns true if already initialized
_mi_stat_increase(&_mi_stats_main.threads, 1);
mi_atomic_increment_relaxed(&thread_count);
mi_subproc_stat_increase(_mi_subproc_main(), threads, 1);
//_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
}
@ -518,14 +569,18 @@ void _mi_thread_done(mi_heap_t* heap)
}
// adjust stats
mi_atomic_decrement_relaxed(&thread_count);
_mi_stat_decrease(&_mi_stats_main.threads, 1);
mi_subproc_stat_decrease(_mi_subproc_main(), threads, 1);
// check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
if (heap->thread_id != _mi_thread_id()) return;
if (heap->tld->thread_id != _mi_prim_thread_id()) return;
// abandon the thread local heap
if (_mi_thread_heap_done(heap)) return; // returns true if already ran
// note: we store the tld as we should avoid reading `thread_tld` at this point (to avoid reinitializing the thread local storage)
mi_tld_t* tld = heap->tld;
_mi_thread_heap_done(heap); // returns true if already ran
// free thread local data
mi_tld_free(tld);
}
void _mi_heap_set_default_direct(mi_heap_t* heap) {
@ -546,7 +601,10 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) {
}
void mi_thread_set_in_threadpool(void) mi_attr_noexcept {
// nothing
mi_tld_t* tld = mi_tld();
if (tld!=NULL) {
tld->is_in_threadpool = true;
}
}
// --------------------------------------------------------
@ -586,7 +644,7 @@ void _mi_process_load(void) {
}
// reseed random
_mi_random_reinit_if_weak(&_mi_heap_main.random);
_mi_random_reinit_if_weak(&heap_main.random);
}
#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
@ -613,19 +671,32 @@ void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once
static mi_atomic_once_t process_init;
#if _MSC_VER < 1920
mi_heap_main_init(); // vs2017 can dynamically re-initialize _mi_heap_main
mi_heap_main_init(); // vs2017 can dynamically re-initialize heap_main
#endif
if (!mi_atomic_once(&process_init)) return;
_mi_process_is_initialized = true;
_mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
mi_process_setup_auto_thread_done();
mi_detect_cpu_features();
_mi_os_init();
_mi_page_map_init();
mi_heap_main_init();
mi_tld_main_init();
// the following two can potentially allocate (on freeBSD for locks and thread keys)
mi_subproc_main_init();
mi_process_setup_auto_thread_done();
#if MI_DEBUG
_mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif
_mi_verbose_message("secure level: %d\n", MI_SECURE);
_mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL);
#if MI_TSAN
_mi_verbose_message("thread santizer enabled\n");
#endif
mi_thread_init();
#if defined(_WIN32)
#if defined(_WIN32) && defined(MI_WIN_USE_FLS)
// On Windows, when building as a static lib the FLS cleanup happens too early for the main thread.
// To avoid this, set the FLS value for the main thread to NULL so the fls cleanup
// will not call _mi_thread_done on the (still executing) main thread. See issue #508.
@ -684,15 +755,14 @@ void mi_cdecl _mi_process_done(void) {
if (mi_option_is_enabled(mi_option_destroy_on_exit)) {
mi_heap_collect(heap, true /* force */);
_mi_heap_unsafe_destroy_all(heap); // forcefully release all memory held by all heaps (of this thread only!)
_mi_arena_unsafe_destroy_all();
_mi_segment_map_unsafe_destroy();
_mi_arenas_unsafe_destroy_all(heap->tld);
}
if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
mi_stats_print(NULL);
}
_mi_allocator_done();
_mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
_mi_verbose_message("process done: 0x%zx\n", tld_main.thread_id);
os_preloading = true; // don't call the C runtime anymore
}

View file

@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -171,7 +171,18 @@ int _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
char c;
MI_NEXTC();
if (c != '%') {
if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t') { // output visible ascii or standard control only
if (c == '\\') {
MI_NEXTC();
switch (c) {
case 'e': mi_outc('\x1B', &out, end); break;
case 't': mi_outc('\t', &out, end); break;
case 'n': mi_outc('\n', &out, end); break;
case 'r': mi_outc('\r', &out, end); break;
case '\\': mi_outc('\\', &out, end); break;
default: /* ignore */ break;
}
}
else if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t' || c=='\x1b') { // output visible ascii or standard control only
mi_outc(c, &out, end);
}
}
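// For example (illustrative, not part of this diff), a literal backslash
// escape in the format string is now translated to its control character so
// ANSI color codes can be embedded in messages:
//
//   _mi_snprintf(buf, sizeof(buf), "\\e[31mheap warning\\e[0m\\n");
//   // buf starts with ESC (0x1B): "\x1B[31mheap warning\x1B[0m\n"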
@ -199,7 +210,10 @@ int _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
}
char* start = out;
if (c == 's') {
if (c == '%') {
mi_outc('%', &out, end);
}
else if (c == 's') {
// string
const char* s = va_arg(args, const char*);
mi_outs(s, &out, end);
@ -275,3 +289,127 @@ int _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) {
va_end(args);
return written;
}
// --------------------------------------------------------
// generic trailing and leading zero count, and popcount
// --------------------------------------------------------
#if !MI_HAS_FAST_BITSCAN
static size_t mi_ctz_generic32(uint32_t x) {
// de Bruijn multiplication, see <http://keithandkatie.com/keith/papers/debruijn.html>
static const uint8_t debruijn[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
if (x==0) return 32;
return debruijn[(uint32_t)((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27];
}
static size_t mi_clz_generic32(uint32_t x) {
// de Bruijn multiplication, see <http://keithandkatie.com/keith/papers/debruijn.html>
static const uint8_t debruijn[32] = {
31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
};
if (x==0) return 32;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27];
}
size_t _mi_ctz_generic(size_t x) {
if (x==0) return MI_SIZE_BITS;
#if (MI_SIZE_BITS <= 32)
return mi_ctz_generic32((uint32_t)x);
#else
const uint32_t lo = (uint32_t)x;
if (lo != 0) {
return mi_ctz_generic32(lo);
}
else {
return (32 + mi_ctz_generic32((uint32_t)(x>>32)));
}
#endif
}
size_t _mi_clz_generic(size_t x) {
if (x==0) return MI_SIZE_BITS;
#if (MI_SIZE_BITS <= 32)
return mi_clz_generic32((uint32_t)x);
#else
const uint32_t hi = (uint32_t)(x>>32);
if (hi != 0) {
return mi_clz_generic32(hi);
}
else {
return 32 + mi_clz_generic32((uint32_t)x);
}
#endif
}
#endif // bit scan
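
// A standalone sketch (not mimalloc code) checking the de Bruijn trick: for a
// single set bit `x = 1<<i`, `x & -x` keeps only that bit, the multiplication
// moves a unique 5-bit pattern into the top bits, and the table maps it back to `i`.
#if 0
#include <stdint.h>
#include <stdio.h>

static size_t naive_ctz32(uint32_t x) {
  size_t n = 0;
  if (x == 0) return 32;
  while ((x & 1) == 0) { x >>= 1; n++; }
  return n;
}

int main(void) {
  static const uint8_t debruijn[32] = {
    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
  };
  for (uint32_t x = 1; x != 0; x <<= 1) {  // all 32 single-bit values
    size_t ctz = debruijn[(uint32_t)((x & (0u - x)) * 0x077CB531U) >> 27];
    if (ctz != naive_ctz32(x)) { printf("mismatch at %08x\n", x); return 1; }
  }
  printf("de Bruijn ctz agrees with the naive loop\n");
  return 0;
}
#endif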
#if !MI_HAS_FAST_POPCOUNT
#if MI_SIZE_SIZE == 4
#define mi_mask_even_bits32 (0x55555555)
#define mi_mask_even_pairs32 (0x33333333)
#define mi_mask_even_nibbles32 (0x0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
static size_t mi_byte_sum32(uint32_t x) {
// perform `x * 0x01010101`: the highest byte contains the sum of all bytes.
x += (x << 8);
x += (x << 16);
return (size_t)(x >> 24);
}
static size_t mi_popcount_generic32(uint32_t x) {
// first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10
// in other words, `a - (a>>1)`; to do this in parallel, we need to mask to prevent spilling a bit pair
// into the lower bit-pair:
x = x - ((x >> 1) & mi_mask_even_bits32);
// add the 2-bit pair results
x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32);
// add the 4-bit nibble results
x = (x + (x >> 4)) & mi_mask_even_nibbles32;
// each byte now holds the count of its bits; sum the bytes:
return mi_byte_sum32(x);
}
size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic32(x);
}
#else
#define mi_mask_even_bits64 (0x5555555555555555)
#define mi_mask_even_pairs64 (0x3333333333333333)
#define mi_mask_even_nibbles64 (0x0F0F0F0F0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
static size_t mi_byte_sum64(uint64_t x) {
x += (x << 8);
x += (x << 16);
x += (x << 32);
return (size_t)(x >> 56);
}
static size_t mi_popcount_generic64(uint64_t x) {
x = x - ((x >> 1) & mi_mask_even_bits64);
x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64);
x = (x + (x >> 4)) & mi_mask_even_nibbles64;
return mi_byte_sum64(x);
}
size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic64(x);
}
#endif
#endif // popcount
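
// A worked example (not mimalloc code) tracing the SWAR steps above on one value:
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t x = 0xF00F00F00F00F00FULL; // six 0xF nibbles -> 24 set bits
  x = x - ((x >> 1) & 0x5555555555555555ULL);                          // 2-bit counts
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); // 4-bit counts
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                          // per-byte counts
  x += (x << 8); x += (x << 16); x += (x << 32);                       // byte sum in the top byte
  printf("popcount = %u\n", (unsigned)(x >> 56));                      // prints 24
  return 0;
}
#endif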

View file

@ -102,6 +102,14 @@ typedef struct mi_option_desc_s {
#endif
#endif
#ifndef MI_DEFAULT_PAGEMAP_COMMIT
#if defined(__APPLE__) // when overloading malloc, we still get mixed pointers sometimes on macOS; this avoids a bad access
#define MI_DEFAULT_PAGEMAP_COMMIT 1
#else
#define MI_DEFAULT_PAGEMAP_COMMIT 0
#endif
#endif
static mi_option_desc_t options[_mi_option_last] =
{
@ -136,18 +144,17 @@ static mi_option_desc_t options[_mi_option_last] =
#else
{ 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
#endif
{ 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds
{ 1000,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds
{ 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes.
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas)
{ 100, UNINIT, MI_OPTION(os_tag) }, // only Apple specific for now but might serve a similar purpose on other platforms
{ 32, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output
{ 32, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output
{ 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try.
{ 10, UNINIT, MI_OPTION(deprecated_max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try.
{ 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
{ MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`)
{ 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's
{ 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) },
{ 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free
{ 1, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's
{ 1, UNINIT, MI_OPTION_LEGACY(deprecated_purge_extend_delay, decommit_extend_delay) },
{ MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's)
{ 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries.
#if defined(MI_VISIT_ABANDONED)
@ -161,8 +168,15 @@ static mi_option_desc_t options[_mi_option_last] =
{ MI_DEFAULT_GUARDED_SAMPLE_RATE,
UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000)
{ 0, UNINIT, MI_OPTION(guarded_sample_seed)},
{ 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable.
{ 10000, UNINIT, MI_OPTION(generic_collect) }, // collect heaps every N (=10000) generic allocation calls
{ 10000, UNINIT, MI_OPTION(generic_collect) }, // collect heaps every N (=10000) generic allocation calls
{ 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim abandoned pages on a free: -1 = disable completely, 0 = only reclaim into the originating heap, 1 = reclaim on free across heaps
{ 2, UNINIT, MI_OPTION(page_full_retain) }, // number of (small) pages to retain in the free page queues
{ 4, UNINIT, MI_OPTION(page_max_candidates) }, // max search to find a best page candidate
{ 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits
{ MI_DEFAULT_PAGEMAP_COMMIT,
UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront?
{ 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this only on overcommit systems (like Linux))
{ 16, UNINIT, MI_OPTION(page_reclaim_max) }, // don't reclaim pages if we already own N pages (in that size class)
};
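
// Usage note (an assumption, not part of this diff): each option can be set
// programmatically before first use, or via its upper-cased environment
// variable with a `MIMALLOC_` prefix, e.g.:
//
//   mi_option_set(mi_option_purge_delay, 1000);   // same as MIMALLOC_PURGE_DELAY=1000
//   mi_option_set(mi_option_page_full_retain, 4); // same as MIMALLOC_PAGE_FULL_RETAIN=4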
static void mi_option_init(mi_option_desc_t* desc);
@ -455,7 +469,7 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me
// Define our own limited `fprintf` that avoids memory allocation.
// We do this using `_mi_vsnprintf` with a limited buffer.
static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
char buf[512];
char buf[992];
if (fmt==NULL) return;
if (!mi_recurse_enter()) return;
_mi_vsnprintf(buf, sizeof(buf)-1, fmt, args);
@ -481,6 +495,13 @@ static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix
}
}
void _mi_raw_message(const char* fmt, ...) {
va_list args;
va_start(args, fmt);
mi_vfprintf(NULL, NULL, NULL, fmt, args);
va_end(args);
}
void _mi_message(const char* fmt, ...) {
va_list args;
va_start(args, fmt);

248
src/os.c
View file

@ -9,21 +9,9 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc/atomic.h"
#include "mimalloc/prim.h"
#define mi_os_stat_increase(stat,amount) _mi_stat_increase(&_mi_stats_main.stat, amount)
#define mi_os_stat_decrease(stat,amount) _mi_stat_decrease(&_mi_stats_main.stat, amount)
#define mi_os_stat_counter_increase(stat,inc) _mi_stat_counter_increase(&_mi_stats_main.stat, inc)
/* -----------------------------------------------------------
Initialization.
----------------------------------------------------------- */
#ifndef MI_DEFAULT_VIRTUAL_ADDRESS_BITS
#if MI_INTPTR_SIZE < 8
#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 32
#else
#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 48
#endif
#endif
#ifndef MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB
#if MI_INTPTR_SIZE < 8
#define MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB 4*MI_MiB // 4 GiB
@ -37,7 +25,7 @@ static mi_os_mem_config_t mi_os_mem_config = {
0, // large page size (usually 2MiB)
4096, // allocation granularity
MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB,
MI_DEFAULT_VIRTUAL_ADDRESS_BITS,
MI_MAX_VABITS, // in `bits.h`
true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems)
false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span)
true // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory)
@ -62,6 +50,18 @@ size_t _mi_os_large_page_size(void) {
return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size());
}
size_t _mi_os_guard_page_size(void) {
const size_t gsize = _mi_os_page_size();
mi_assert(gsize <= (MI_ARENA_SLICE_SIZE/8));
return gsize;
}
size_t _mi_os_virtual_address_bits(void) {
const size_t vbits = mi_os_mem_config.virtual_address_bits;
mi_assert(vbits <= MI_MAX_VABITS);
return vbits;
}
bool _mi_os_use_large_page(size_t size, size_t alignment) {
// if we have access, check the size and alignment requirements
if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false;
@ -91,73 +91,54 @@ void _mi_os_init(void) {
bool _mi_os_decommit(void* addr, size_t size);
bool _mi_os_commit(void* addr, size_t size, bool* is_zero);
static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
mi_assert_internal(alignment != 0);
uintptr_t mask = alignment - 1;
if ((alignment & mask) == 0) { // power of two?
return (sz & ~mask);
}
else {
return ((sz / alignment) * alignment);
}
}
static void* mi_align_down_ptr(void* p, size_t alignment) {
return (void*)_mi_align_down((uintptr_t)p, alignment);
}
/* -----------------------------------------------------------
aligned hinting
-------------------------------------------------------------- */
// On systems with enough virtual address bits, we can do efficient aligned allocation by using
// the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address
// space (64TiB) we use this technique. (but see issue #939)
#if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT)
static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
// Return a MI_SEGMENT_SIZE aligned address that is probably available.
// If this returns NULL, the OS will determine the address, but on some OSes it may not be
// properly aligned, which can be more costly as it needs to be adjusted afterwards.
// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
// in the middle of the 2TiB - 6TiB address range (see issue #372))
#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start
#define MI_HINT_AREA ((uintptr_t)4 << 40) // up to 6TiB (since before Win8 there is "only" 8TiB available to processes)
#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size)
{
if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
if (mi_os_mem_config.virtual_address_bits < 46) return NULL; // < 64TiB virtual address space
size = _mi_align_up(size, MI_SEGMENT_SIZE);
if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
#if (MI_SECURE>0)
size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas.
#endif
uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize
uintptr_t init = MI_HINT_BASE;
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode
uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap());
init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB
#endif
uintptr_t expected = hint + size;
mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
}
if (hint%try_alignment != 0) return NULL;
return (void*)hint;
}
#else
void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
MI_UNUSED(try_alignment); MI_UNUSED(size);
return NULL;
}
#endif
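#if 0
// Illustrative sketch (not part of this diff): how a caller could consume the
// hint on an mmap-based system. The OS is free to ignore the hint address, in
// which case the allocation still succeeds elsewhere and any misalignment is
// fixed up by the over-allocation path in `mi_os_prim_alloc_aligned`.
#include <sys/mman.h>
static void* example_alloc_with_hint(size_t size) {
  void* hint = _mi_os_get_aligned_hint(MI_SEGMENT_SIZE, size);  // may be NULL
  void* p = mmap(hint, size, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);  // hint may be ignored
  return (p == MAP_FAILED ? NULL : p);
}
#endif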
// In secure mode, return the size of a guard page, otherwise 0
size_t _mi_os_secure_guard_page_size(void) {
#if MI_SECURE > 0
return _mi_os_guard_page_size();
#else
return 0;
#endif
}
// In secure mode, try to decommit an area and output a warning if this fails.
bool _mi_os_secure_guard_page_set_at(void* addr, bool is_pinned) {
if (addr == NULL) return true;
#if MI_SECURE > 0
const bool ok = (is_pinned ? false : _mi_os_decommit(addr, _mi_os_secure_guard_page_size()));
if (!ok) {
_mi_error_message(EINVAL, "secure level %d, but failed to commit guard page (at %p of size %zu)\n", MI_SECURE, addr, _mi_os_secure_guard_page_size());
}
return ok;
#else
MI_UNUSED(is_pinned);
return true;
#endif
}
// In secure mode, try to decommit an area and output a warning if this fails.
bool _mi_os_secure_guard_page_set_before(void* addr, bool is_pinned) {
return _mi_os_secure_guard_page_set_at((uint8_t*)addr - _mi_os_secure_guard_page_size(), is_pinned);
}
// In secure mode, try to recommit an area
bool _mi_os_secure_guard_page_reset_at(void* addr) {
if (addr == NULL) return true;
#if MI_SECURE > 0
return _mi_os_commit(addr, _mi_os_secure_guard_page_size(), NULL);
#else
return true;
#endif
}
// In secure mode, try to recommit an area
bool _mi_os_secure_guard_page_reset_before(void* addr) {
return _mi_os_secure_guard_page_reset_at((uint8_t*)addr - _mi_os_secure_guard_page_size());
}
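// Usage pattern (illustrative, not part of this diff): an arena can place a
// decommitted guard page directly before a block so that underflow writes
// fault, and recommit it when the block is released:
//   _mi_os_secure_guard_page_set_before(block, false /* is_pinned */);
//   ... use block ...
//   _mi_os_secure_guard_page_reset_before(block);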
/* -----------------------------------------------------------
Free memory
@ -186,10 +167,10 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me
void* base = addr;
// different base? (due to alignment)
if (memid.mem.os.base != base) {
mi_assert(memid.mem.os.base <= addr);
base = memid.mem.os.base;
const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base;
if (memid.mem.os.size==0) {
csize += diff;
}
if (still_committed) {
@ -236,8 +217,6 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm
_mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large);
}
mi_os_stat_counter_increase(mmap_calls, 1);
if (p != NULL) {
mi_os_stat_increase(reserved, size);
@ -270,18 +249,24 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
size = _mi_align_up(size, _mi_os_page_size());
// try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD)
void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
if (p == NULL) return NULL;
// try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size.
const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8);
void* p = NULL;
if (try_direct_alloc) {
p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
}
// aligned already?
if (((uintptr_t)p % alignment) == 0) {
if (p != NULL && ((uintptr_t)p % alignment) == 0) {
*base = p;
}
else {
// if not aligned, free it, overallocate, and unmap around it
#if !MI_TRACK_ASAN
_mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
if (try_direct_alloc) {
_mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
}
#endif
if (p != NULL) { mi_os_prim_free(p, size, (commit ? size : 0)); }
if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
@ -293,10 +278,10 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
if (p == NULL) return NULL;
// set p to the aligned part in the full region
// note: this is dangerous on Windows as VirtualFree needs the actual base pointer
// this is handled though by having the `base` field in the memid's
// note: on Windows VirtualFree needs the actual base pointer
// this is handled by having the `base` field in the memid.
*base = p; // remember the base
p = mi_align_up_ptr(p, alignment);
p = _mi_align_up_ptr(p, alignment);
// explicitly commit only the aligned part
if (commit) {
@ -309,7 +294,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
if (p == NULL) return NULL;
// and selectively unmap parts around the over-allocated area.
void* aligned_p = mi_align_up_ptr(p, alignment);
void* aligned_p = _mi_align_up_ptr(p, alignment);
size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
size_t mid_size = _mi_align_up(size, _mi_os_page_size());
size_t post_size = over_size - pre_size - mid_size;
@ -339,7 +324,7 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid) {
bool os_is_zero = false;
void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero);
if (p != NULL) {
*memid = _mi_memid_create_os(true, os_is_zero, os_is_large);
*memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large);
}
return p;
}
@ -355,9 +340,9 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
bool os_is_large = false;
bool os_is_zero = false;
void* os_base = NULL;
void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base );
void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base);
if (p != NULL) {
*memid = _mi_memid_create_os(commit, os_is_zero, os_is_large);
*memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large);
memid->mem.os.base = os_base;
// memid->mem.os.alignment = alignment;
memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned
@ -365,6 +350,18 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
return p;
}
void* _mi_os_zalloc(size_t size, mi_memid_t* memid) {
void* p = _mi_os_alloc(size, memid);
if (p == NULL) return NULL;
// zero the OS memory if needed
if (!memid->initially_zero) {
_mi_memzero_aligned(p, size);
memid->initially_zero = true;
}
return p;
}
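// Usage (illustrative): callers that need zero'd OS memory go through
// `_mi_os_zalloc` so pages the OS already returns zeroed are not cleared a
// second time:
//   mi_memid_t memid;
//   void* p = _mi_os_zalloc(n, &memid);  // p != NULL implies zeroed memory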
/* -----------------------------------------------------------
OS aligned allocation with an offset. This is used
for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc
@ -374,11 +371,9 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
----------------------------------------------------------- */
void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) {
mi_assert(offset <= MI_SEGMENT_SIZE);
mi_assert(offset <= size);
mi_assert((alignment % _mi_os_page_size()) == 0);
*memid = _mi_memid_none();
if (offset > MI_SEGMENT_SIZE) return NULL;
if (offset == 0) {
// regular aligned allocation
return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid);
@ -411,11 +406,11 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size,
if (newsize != NULL) *newsize = 0;
if (size == 0 || addr == NULL) return NULL;
// page align conservatively within the range
void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size())
// page align conservatively within the range, or liberally so the pages may extend outside the range
void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size())
: mi_align_down_ptr(addr, _mi_os_page_size()));
void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
: mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
: _mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
if (diff <= 0) return NULL;
@ -526,7 +521,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size)
return needs_recommit;
}
else {
if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed
if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory)
_mi_os_reset(p, size);
}
return false; // needs no recommit
@ -591,15 +586,14 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
start = huge_start;
if (start == 0) {
// Initialize the start address after the 8TiB area
start = ((uintptr_t)32 << 40); // 32TiB virtual start address
start = ((uintptr_t)8 << 40); // 8TiB virtual start address
#if (MI_SECURE>0 || MI_DEBUG==0)  // security: randomize start of huge pages unless in debug mode
uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap());
start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB
#endif
}
end = start + size;
mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
} while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
} while (!mi_atomic_cas_weak_acq_rel(&mi_huge_start, &huge_start, end));
if (total_size != NULL) *total_size = size;
return (uint8_t*)start;
@ -612,7 +606,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
}
#endif
// Allocate MI_SEGMENT_SIZE aligned huge pages
// Allocate MI_ARENA_SLICE_ALIGN aligned huge pages
void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) {
*memid = _mi_memid_none();
if (psize != NULL) *psize = 0;
@ -674,7 +668,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
if (page != 0) {
mi_assert(start != NULL);
*memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */);
*memid = _mi_memid_create_os(start, *psize, true /* is committed */, all_zero, true /* is_large */);
memid->memkind = MI_MEM_OS_HUGE;
mi_assert(memid->is_pinned);
#ifdef MI_TRACK_ASAN
@ -727,3 +721,49 @@ int _mi_os_numa_node_get(void) {
if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
return (int)numa_node;
}
/* ----------------------------------------------------------------------------
Public API
-----------------------------------------------------------------------------*/
#if 0
mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size) {
return mi_os_alloc_aligned(size, mi_os_mem_config.alloc_granularity, commit, NULL, full_size);
}
static void* mi_os_alloc_aligned_ex(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) {
mi_memid_t memid = _mi_memid_none();
void* p = _mi_os_alloc_aligned(size, alignment, commit, allow_large, &memid);
if (p == NULL) return p;
if (is_committed != NULL) { *is_committed = memid.initially_committed; }
if (is_pinned != NULL) { *is_pinned = memid.is_pinned; }
if (base != NULL) { *base = memid.mem.os.base; }
if (full_size != NULL) { *full_size = memid.mem.os.size; }
if (!memid.initially_zero && memid.initially_committed) {
_mi_memzero_aligned(memid.mem.os.base, memid.mem.os.size);
}
return p;
}
mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size) {
return mi_os_alloc_aligned_ex(size, alignment, commit, false, NULL, NULL, base, full_size);
}
mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) {
return mi_os_alloc_aligned_ex(size, alignment, commit, true, is_committed, is_pinned, base, full_size);
}
mi_decl_export void mi_os_free(void* p, size_t size) {
if (p==NULL || size == 0) return;
mi_memid_t memid = _mi_memid_create_os(p, size, true, false, false);
_mi_os_free(p, size, memid);
}
mi_decl_export void mi_os_commit(void* p, size_t size) {
_mi_os_commit(p, size, NULL);
}
mi_decl_export void mi_os_decommit(void* p, size_t size) {
_mi_os_decommit(p, size);
}
#endif

src/page-map.c (new file)
View file

@ -0,0 +1,334 @@
/*----------------------------------------------------------------------------
Copyright (c) 2023-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"
#if MI_PAGE_MAP_FLAT
// The page-map contains a byte for each 64 KiB slice in the address space.
// For an address `a` where `ofs = _mi_page_map[a >> 16]`:
// 0 = unused
// 1 = the slice at `a & ~0xFFFF` is the start of a mimalloc page.
// 1 < ofs <= 127 = the slice is part of a page that starts at `(((a>>16) - (ofs - 1)) << 16)`.
//
// 1 byte per slice => a 1 TiB address space needs a 2^(40-16) = 16 MiB page map.
// A full 256 TiB address space (48 bit) needs a 4 GiB page map.
// A full 4 GiB address space (32 bit) needs only a 64 KiB page map.
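#if 0
// Worked example (illustrative, not part of this change), assuming the 64 KiB
// slices (shift 16) described above: resolve a pointer to its page start.
static mi_page_t* example_flat_lookup(const void* p) {
  const uintptr_t idx = (uintptr_t)p >> 16;   // which 64 KiB slice?
  const uint8_t   ofs = _mi_page_map[idx];    // 0 means: not a mimalloc page
  if (ofs == 0) return NULL;
  return (mi_page_t*)((idx - (ofs - 1)) << 16);  // back up to the first slice
}
#endif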
mi_decl_cache_align uint8_t* _mi_page_map = NULL;
static void* mi_page_map_max_address = NULL;
static mi_memid_t mi_page_map_memid;
#define MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT MI_ARENA_SLICE_SIZE
static mi_bitmap_t* mi_page_map_commit; // one bit per committed 64 KiB entries
static void mi_page_map_ensure_committed(size_t idx, size_t slice_count);
bool _mi_page_map_init(void) {
size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS);
if (vbits == 0) {
vbits = _mi_os_virtual_address_bits();
#if MI_ARCH_X64 // canonical address is limited to the first 128 TiB
if (vbits >= 48) { vbits = 47; }
#endif
}
// Allocate the page map and commit bits
mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits));
const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT));
const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_pagemap_commit)); // _mi_os_has_overcommit(); // commit on-access on Linux systems?
const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT);
const size_t bitmap_size = (commit ? 0 : mi_bitmap_size(commit_bits, NULL));
const size_t reserve_size = bitmap_size + page_map_size;
uint8_t* const base = (uint8_t*)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid);
if (base==NULL) {
_mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB);
return false;
}
if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) {
_mi_warning_message("internal: the page map was committed but not zero initialized!\n");
_mi_memzero_aligned(base, reserve_size);
}
if (bitmap_size > 0) {
mi_page_map_commit = (mi_bitmap_t*)base;
_mi_os_commit(mi_page_map_commit, bitmap_size, NULL);
mi_bitmap_init(mi_page_map_commit, commit_bits, true);
}
_mi_page_map = base + bitmap_size;
// commit the first part so NULL pointers get resolved without an access violation
if (!commit) {
mi_page_map_ensure_committed(0, 1);
}
_mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL
mi_assert_internal(_mi_ptr_page(NULL)==NULL);
return true;
}
static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) {
// is the page map area that contains the page address committed?
// we always set the commit bits so we can track what ranges are in-use.
// we only actually commit if the map wasn't committed fully already.
if (mi_page_map_commit != NULL) {
const size_t commit_idx = idx / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
const size_t commit_idx_hi = (idx + slice_count - 1) / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
for (size_t i = commit_idx; i <= commit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks
if (mi_bitmap_is_clear(mi_page_map_commit, i)) {
// this may race, in which case we do multiple commits (which is ok)
bool is_zero;
uint8_t* const start = _mi_page_map + (i * MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT);
const size_t size = MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT;
_mi_os_commit(start, size, &is_zero);
if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); }
mi_bitmap_set(mi_page_map_commit, i);
}
}
}
#if MI_DEBUG > 0
_mi_page_map[idx] = 0;
_mi_page_map[idx+slice_count-1] = 0;
#endif
}
static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) {
size_t page_size;
*page_start = mi_page_area(page, &page_size);
if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer
*slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks
return _mi_page_map_index(page);
}
void _mi_page_map_register(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access!
if mi_unlikely(_mi_page_map == NULL) {
if (!_mi_page_map_init()) return;
}
mi_assert(_mi_page_map!=NULL);
uint8_t* page_start;
size_t slice_count;
const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count);
mi_page_map_ensure_committed(idx, slice_count);
// set the offsets
for (size_t i = 0; i < slice_count; i++) {
mi_assert_internal(i < 128);
_mi_page_map[idx + i] = (uint8_t)(i+1);
}
}
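// Example (illustrative): a page spanning 3 slices at index `idx` gets the
// entries [1,2,3], so a lookup can map any interior pointer back to the
// first slice via `ofs - 1`.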
void _mi_page_map_unregister(mi_page_t* page) {
mi_assert_internal(_mi_page_map != NULL);
// get index and count
uint8_t* page_start;
size_t slice_count;
const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count);
// unset the offsets
_mi_memzero(_mi_page_map + idx, slice_count);
}
void _mi_page_map_unregister_range(void* start, size_t size) {
const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE);
const uintptr_t index = _mi_page_map_index(start);
mi_page_map_ensure_committed(index, slice_count); // we commit the range in total; todo: scan the commit bits and clear only those ranges?
_mi_memzero(&_mi_page_map[index], slice_count);
}
mi_page_t* _mi_safe_ptr_page(const void* p) {
if mi_unlikely(p >= mi_page_map_max_address) return NULL;
const uintptr_t idx = _mi_page_map_index(p);
if mi_unlikely(mi_page_map_commit != NULL && !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL;
const uintptr_t ofs = _mi_page_map[idx];
if mi_unlikely(ofs == 0) return NULL;
return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) - ofs + 1) << MI_ARENA_SLICE_SHIFT);
}
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return (_mi_safe_ptr_page(p) != NULL);
}
#else
// A 2-level page map
#define MI_PAGE_MAP_SUB_SIZE (MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*))
mi_decl_cache_align mi_page_t*** _mi_page_map;
static void* mi_page_map_max_address;
static mi_memid_t mi_page_map_memid;
static _Atomic(mi_bfield_t) mi_page_map_commit;
static mi_page_t** mi_page_map_ensure_committed(size_t idx);
static mi_page_t** mi_page_map_ensure_at(size_t idx);
static inline void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count);
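// Illustrative sketch (not part of this change): the two-level lookup splits
// the slice index of an address into a top-level index and a sub-map index.
// The exact split below is an assumption that mirrors how
// `_mi_page_map_index(p, &sub_idx)` is used in `_mi_safe_ptr_page`:
//   size_t slice   = (uintptr_t)p >> MI_ARENA_SLICE_SHIFT;
//   size_t sub_idx = slice & (MI_PAGE_MAP_SUB_COUNT - 1);
//   size_t idx     = slice >> MI_PAGE_MAP_SUB_SHIFT;
//   mi_page_t* page = _mi_page_map[idx][sub_idx];  // sub map may be NULL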
bool _mi_page_map_init(void) {
size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS);
if (vbits == 0) {
vbits = _mi_os_virtual_address_bits();
#if MI_ARCH_X64 // canonical address is limited to the first 128 TiB
if (vbits >= 48) { vbits = 47; }
#endif
}
// Allocate the page map and commit bits
mi_assert(MI_MAX_VABITS >= vbits);
mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits));
const size_t page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT));
mi_assert(page_map_count <= MI_PAGE_MAP_COUNT);
const size_t os_page_size = _mi_os_page_size();
const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size);
const size_t reserve_size = page_map_size + os_page_size;
const bool commit = page_map_size <= 64*MI_KiB ||
mi_option_is_enabled(mi_option_pagemap_commit) || _mi_os_has_overcommit();
_mi_page_map = (mi_page_t***)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid);
if (_mi_page_map==NULL) {
_mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB);
return false;
}
if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) {
_mi_warning_message("internal: the page map was committed but not zero initialized!\n");
_mi_memzero_aligned(_mi_page_map, page_map_size);
}
mi_atomic_store_release(&mi_page_map_commit, (commit ? ~MI_ZU(0) : MI_ZU(0)));
// note: for the NULL range we only commit one OS page (in the map and sub)
if (!mi_page_map_memid.initially_committed) {
_mi_os_commit(&_mi_page_map[0], os_page_size, NULL); // commit first part of the map
}
_mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved 2 sub maps at the end already
if (!mi_page_map_memid.initially_committed) {
_mi_os_commit(_mi_page_map[0], os_page_size, NULL); // only first OS page
}
_mi_page_map[0][0] = (mi_page_t*)&_mi_page_empty; // caught in `mi_free`
mi_assert_internal(_mi_ptr_page(NULL)==&_mi_page_empty);
return true;
}
#define MI_PAGE_MAP_ENTRIES_PER_CBIT (MI_PAGE_MAP_COUNT / MI_BFIELD_BITS)
static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) {
mi_bfield_t commit = mi_atomic_load_relaxed(&mi_page_map_commit);
const size_t bit_idx = idx/MI_PAGE_MAP_ENTRIES_PER_CBIT;
mi_assert_internal(bit_idx < MI_BFIELD_BITS);
if (pbit_idx != NULL) { *pbit_idx = bit_idx; }
return ((commit & (MI_ZU(1) << bit_idx)) != 0);
}
static mi_page_t** mi_page_map_ensure_committed(size_t idx) {
size_t bit_idx;
if mi_unlikely(!mi_page_map_is_committed(idx, &bit_idx)) {
uint8_t* start = (uint8_t*)&_mi_page_map[bit_idx * MI_PAGE_MAP_ENTRIES_PER_CBIT];
_mi_os_commit(start, MI_PAGE_MAP_ENTRIES_PER_CBIT * sizeof(mi_page_t**), NULL);
mi_atomic_or_acq_rel(&mi_page_map_commit, MI_ZU(1) << bit_idx);
}
return _mi_page_map[idx];
}
static mi_page_t** mi_page_map_ensure_at(size_t idx) {
mi_page_t** sub = mi_page_map_ensure_committed(idx);
if mi_unlikely(sub == NULL) {
// sub map not yet allocated, alloc now
mi_memid_t memid;
sub = (mi_page_t**)_mi_os_alloc(MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), &memid);
mi_page_t** expect = NULL;
if (!mi_atomic_cas_strong_acq_rel(((_Atomic(mi_page_t**)*)&_mi_page_map[idx]), &expect, sub)) {
// another thread already allocated it.. free and continue
_mi_os_free(sub, MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), memid);
sub = expect;
mi_assert_internal(sub!=NULL);
}
if (sub == NULL) {
_mi_error_message(EFAULT, "internal error: unable to extend the page map\n");
}
}
return sub;
}
static void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count) {
// is the page map area that contains the page address committed?
while (slice_count > 0) {
mi_page_t** sub = mi_page_map_ensure_at(idx);
// set the offsets for the page
while (sub_idx < MI_PAGE_MAP_SUB_COUNT) {
sub[sub_idx] = page;
slice_count--; if (slice_count == 0) return;
sub_idx++;
}
idx++; // move on to the next top-level entry
sub_idx = 0;
}
}
static size_t mi_page_map_get_idx(mi_page_t* page, size_t* sub_idx, size_t* slice_count) {
size_t page_size;
uint8_t* page_start = mi_page_area(page, &page_size);
if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer
*slice_count = mi_slice_count_of_size(page_size) + ((page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks
return _mi_page_map_index(page, sub_idx);
}
void _mi_page_map_register(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access!
if mi_unlikely(_mi_page_map == NULL) {
if (!_mi_page_map_init()) return;
}
mi_assert(_mi_page_map!=NULL);
size_t slice_count;
size_t sub_idx;
const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count);
mi_page_map_set_range(page, idx, sub_idx, slice_count);
}
void _mi_page_map_unregister(mi_page_t* page) {
mi_assert_internal(_mi_page_map != NULL);
mi_assert_internal(page != NULL);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
if mi_unlikely(_mi_page_map == NULL) return;
// get index and count
size_t slice_count;
size_t sub_idx;
const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count);
// unset the offsets
mi_page_map_set_range(NULL, idx, sub_idx, slice_count);
}
void _mi_page_map_unregister_range(void* start, size_t size) {
if mi_unlikely(_mi_page_map == NULL) return;
const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE);
size_t sub_idx;
const uintptr_t idx = _mi_page_map_index(start, &sub_idx);
mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not already committed?
}
// Return NULL for invalid pointers
mi_page_t* _mi_safe_ptr_page(const void* p) {
if (p==NULL) return NULL;
if mi_unlikely(p >= mi_page_map_max_address) return NULL;
size_t sub_idx;
const size_t idx = _mi_page_map_index(p,&sub_idx);
if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL;
mi_page_t** const sub = _mi_page_map[idx];
if mi_unlikely(sub==NULL) return NULL;
return sub[sub_idx];
}
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return (_mi_safe_ptr_page(p) != NULL);
}
#endif

View file

@ -12,7 +12,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MI_IN_PAGE_C
#error "this file should be included from 'page.c'"
// include to help an IDE
#include "mimalloc.h"
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"
#endif
@ -38,15 +38,19 @@ terms of the MIT license. A copy of the license can be found in the file
static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+sizeof(uintptr_t)));
return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+sizeof(uintptr_t)));
}
static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+(2*sizeof(uintptr_t))));
return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+(2*sizeof(uintptr_t))));
}
static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
return (pq->block_size > MI_LARGE_OBJ_SIZE_MAX);
return (pq->block_size > MI_LARGE_MAX_OBJ_SIZE);
}
static inline size_t mi_page_queue_count(const mi_page_queue_t* pq) {
return pq->count;
}
/* -----------------------------------------------------------
@ -57,8 +61,8 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
// Returns MI_BIN_HUGE if the size is too large.
// We use `wsize` for the size in "machine word sizes",
// i.e. byte size == `wsize*sizeof(void*)`.
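// Example (illustrative): on a 64-bit system a request of 100 bytes gives
// wsize = 13 (13*8 = 104 >= 100), which falls in a small-object bin; any
// wsize above MI_LARGE_MAX_OBJ_WSIZE maps to MI_BIN_HUGE.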
static inline size_t mi_bin(size_t size) {
static mi_decl_noinline size_t mi_bin(size_t size) {
size_t wsize = _mi_wsize_from_size(size);
#if defined(MI_ALIGN4W)
if mi_likely(wsize <= 4) {
return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes
@ -72,7 +76,7 @@ static inline size_t mi_bin(size_t size) {
return (wsize == 0 ? 1 : wsize);
}
#endif
else if mi_unlikely(wsize > MI_LARGE_OBJ_WSIZE_MAX) {
else if mi_unlikely(wsize > MI_LARGE_MAX_OBJ_WSIZE) {
return MI_BIN_HUGE;
}
else {
@ -106,8 +110,8 @@ size_t _mi_bin_size(size_t bin) {
}
// Good size for allocation
size_t mi_good_size(size_t size) mi_attr_noexcept {
if (size <= MI_LARGE_OBJ_SIZE_MAX) {
mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept {
if (size <= MI_LARGE_MAX_OBJ_SIZE) {
return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE));
}
else {
@ -136,6 +140,34 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t*
}
#endif
bool _mi_page_queue_is_valid(mi_heap_t* heap, const mi_page_queue_t* pq) {
MI_UNUSED_RELEASE(heap);
if (pq==NULL) return false;
size_t count = 0; MI_UNUSED_RELEASE(count);
mi_page_t* prev = NULL; MI_UNUSED_RELEASE(prev);
for (mi_page_t* page = pq->first; page != NULL; page = page->next) {
mi_assert_internal(page->prev == prev);
if (mi_page_is_in_full(page)) {
mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 2);
}
else if (mi_page_is_huge(page)) {
mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 1);
}
else {
mi_assert_internal(mi_page_block_size(page) == pq->block_size);
}
mi_assert_internal(page->heap == heap);
if (page->next == NULL) {
mi_assert_internal(pq->last == page);
}
count++;
prev = page;
}
mi_assert_internal(pq->count == count);
return true;
}
static size_t mi_page_bin(const mi_page_t* page) {
const size_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page))));
mi_assert_internal(bin <= MI_BIN_FULL);
@ -210,8 +242,9 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(mi_page_queue_contains(queue, page));
mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
mi_assert_internal(queue->count >= 1);
mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_heap_t* heap = mi_page_heap(page);
if (page->prev != NULL) page->prev->next = page->next;
@ -224,9 +257,9 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
mi_heap_queue_first_update(heap,queue);
}
heap->page_count--;
queue->count--;
page->next = NULL;
page->prev = NULL;
// mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
mi_page_set_in_full(page,false);
}
@ -242,7 +275,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
// mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
page->next = queue->first;
page->prev = NULL;
if (queue->first != NULL) {
@ -253,12 +286,42 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
else {
queue->first = queue->last = page;
}
queue->count++;
// update direct
mi_heap_queue_first_update(heap, queue);
heap->page_count++;
}
static void mi_page_queue_push_at_end(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(!mi_page_queue_contains(queue, page));
mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
page->prev = queue->last;
page->next = NULL;
if (queue->last != NULL) {
mi_assert_internal(queue->last->next == NULL);
queue->last->next = page;
queue->last = page;
}
else {
queue->first = queue->last = page;
}
queue->count++;
// update direct
if (queue->first == page) {
mi_heap_queue_first_update(heap, queue);
}
heap->page_count++;
}
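// Design note (inferred, not stated in this change): reclaimed pages are
// pushed at the end of the queue (see `_mi_heap_page_reclaim`) so allocation
// keeps preferring pages that are already in use over freshly reclaimed ones.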
static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(mi_page_queue_contains(queue, page));
@ -270,6 +333,7 @@ static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue,
static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* from, bool enqueue_at_end, mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(from->count >= 1);
mi_assert_expensive(mi_page_queue_contains(from, page));
mi_assert_expensive(!mi_page_queue_contains(to, page));
const size_t bsize = mi_page_block_size(page);
@ -292,8 +356,10 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t*
mi_assert_internal(mi_heap_contains_queue(heap, from));
mi_heap_queue_first_update(heap, from);
}
from->count--;
// insert into `to`
to->count++;
if (enqueue_at_end) {
// enqueue at the end
page->prev = to->last;
@ -317,8 +383,8 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t*
page->prev = to->first;
page->next = next;
to->first->next = page;
if (next != NULL) {
next->prev = page;
}
else {
to->last = page;
@ -356,15 +422,10 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
// set append pages to new heap and count
size_t count = 0;
for (mi_page_t* page = append->first; page != NULL; page = page->next) {
// inline `mi_page_set_heap` to avoid wrong assertion during absorption;
// in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
// set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
// side effect that it spins until any DELAYED_FREEING is finished. This ensures
// that after appending only the new heap will be used for delayed free operations.
_mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
mi_page_set_heap(page, heap);
count++;
}
mi_assert_internal(count == append->count);
if (pq->last==NULL) {
// take over afresh
@ -381,5 +442,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
append->first->prev = pq->last;
pq->last = append->last;
}
pq->count += append->count;
return count;
}

View file

@ -36,14 +36,15 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta
return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
}
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
//static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page);
#if (MI_DEBUG>=3)
static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
mi_assert_internal(_mi_ptr_page(page) == page);
size_t count = 0;
while (head != NULL) {
mi_assert_internal(page == _mi_ptr_page(head));
mi_assert_internal((uint8_t*)head - (uint8_t*)page > (ptrdiff_t)MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head));
count++;
head = mi_block_next(page, head);
}
@ -59,7 +60,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) {
static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
size_t psize;
uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize);
uint8_t* page_area = mi_page_area(page, &psize);
mi_block_t* start = (mi_block_t*)page_area;
mi_block_t* end = (mi_block_t*)(page_area + psize);
while(p != NULL) {
@ -83,10 +84,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
mi_assert_internal(page->capacity <= page->reserved);
// const size_t bsize = mi_page_block_size(page);
mi_segment_t* segment = _mi_page_segment(page);
uint8_t* start = mi_page_start(page);
mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE));
// uint8_t* start = mi_page_start(page);
//mi_assert_internal(start + page->capacity*page->block_size == page->top);
mi_assert_internal(mi_page_list_is_valid(page,page->free));
@ -121,86 +119,38 @@ bool _mi_page_is_valid(mi_page_t* page) {
#if MI_SECURE
mi_assert_internal(page->keys[0] != 0);
#endif
if (mi_page_heap(page)!=NULL) {
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == mi_page_heap(page)->thread_id || segment->thread_id==0);
#if MI_HUGE_PAGE_ABANDON
if (segment->page_kind != MI_PAGE_HUGE)
#endif
if (!mi_page_is_abandoned(page)) {
//mi_assert_internal(!_mi_process_is_initialized);
{
mi_page_queue_t* pq = mi_page_queue_of(page);
mi_assert_internal(mi_page_queue_contains(pq, page));
mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page));
mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page));
// mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
}
}
return true;
}
#endif
void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
while (!_mi_page_try_use_delayed_free(page, delay, override_never)) {
mi_atomic_yield();
}
}
bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
mi_thread_free_t tfreex;
mi_delayed_t old_delay;
mi_thread_free_t tfree;
size_t yield_count = 0;
do {
tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
tfreex = mi_tf_set_delayed(tfree, delay);
old_delay = mi_tf_delayed(tfree);
if mi_unlikely(old_delay == MI_DELAYED_FREEING) {
if (yield_count >= 4) return false; // give up after 4 tries
yield_count++;
mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
// tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
}
else if (delay == old_delay) {
break; // avoid atomic operation if already equal
}
else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) {
break; // leave never-delayed flag set
}
} while ((old_delay == MI_DELAYED_FREEING) ||
!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
return true; // success
}
/* -----------------------------------------------------------
Page collect the `local_free` and `thread_free` lists
----------------------------------------------------------- */
// Collect the local `thread_free` list using an atomic exchange.
// Note: The exchange must be done atomically as this is used right after
// moving to the full list in `mi_page_collect_ex` and we need to
// ensure that there was no race where the page became unfull just before the move.
static void _mi_page_thread_free_collect(mi_page_t* page)
static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head)
{
mi_block_t* head;
mi_thread_free_t tfreex;
mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
head = mi_tf_block(tfree);
tfreex = mi_tf_set_block(tfree,NULL);
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));
// return if the list is empty
if (head == NULL) return;
// find the tail -- also to get a proper count (without data races)
// find the last block in the list -- also to get a proper use count (without data races)
size_t max_count = page->capacity; // cannot collect more than capacity
size_t count = 1;
mi_block_t* tail = head;
mi_block_t* last = head;
mi_block_t* next;
while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) {
while ((next = mi_block_next(page, last)) != NULL && count <= max_count) {
count++;
tail = next;
last = next;
}
// if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
if (count > max_count) {
_mi_error_message(EFAULT, "corrupted thread-free list\n");
@ -208,20 +158,37 @@ static void _mi_page_thread_free_collect(mi_page_t* page)
}
// and append the current local free list
mi_block_set_next(page,tail, page->local_free);
mi_block_set_next(page, last, page->local_free);
page->local_free = head;
// update counts now
page->used -= (uint16_t)count;
mi_assert_internal(count <= UINT16_MAX);
page->used = page->used - (uint16_t)count;
}
// Collect the local `thread_free` list using an atomic exchange.
static void mi_page_thread_free_collect(mi_page_t* page)
{
// atomically capture the thread free list
mi_block_t* head;
mi_thread_free_t tfreex;
mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
head = mi_tf_block(tfree);
if mi_likely(head == NULL) return; // return if the list is empty
tfreex = mi_tf_create(NULL,mi_tf_is_owned(tfree)); // set the thread free list to NULL
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough?
mi_assert_internal(head != NULL);
// and move it to the local list
mi_page_thread_collect_to_local(page, head);
}
void _mi_page_free_collect(mi_page_t* page, bool force) {
mi_assert_internal(page!=NULL);
// collect the thread free list
if (force || mi_page_thread_free(page) != NULL) { // quick test to avoid an atomic operation
_mi_page_thread_free_collect(page);
}
mi_page_thread_free_collect(page);
// and the local free list
if (page->local_free != NULL) {
@ -248,50 +215,116 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
mi_assert_internal(!force || page->local_free == NULL);
}
// Collect elements in the thread-free list starting at `head`. This is an optimized
// version of `_mi_page_free_collect` to be used from `free.c:_mi_free_collect_mt` that avoids atomic access to `xthread_free`.
//
// `head` must be in the `xthread_free` list. It will not collect `head` itself
// so the `used` count is not fully updated in general. However, if the `head` is
// the last remaining element, it will be collected and the used count will become `0` (so `mi_page_all_free` becomes true).
void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) {
if (head == NULL) return;
mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point to it (and we want to avoid atomic ops)
if (next != NULL) {
mi_block_set_next(page, head, NULL);
mi_page_thread_collect_to_local(page, next);
if (page->local_free != NULL && page->free == NULL) {
page->free = page->local_free;
page->local_free = NULL;
page->free_is_zero = false;
}
}
if (page->used == 1) {
// all elements are free'd since we skipped the `head` element itself
mi_assert_internal(mi_tf_block(mi_atomic_load_relaxed(&page->xthread_free)) == head);
mi_assert_internal(mi_block_next(page,head) == NULL);
_mi_page_free_collect(page, false); // collect the final element
}
}
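// Illustrative sketch (hypothetical caller, not part of this diff): a
// cross-thread free path that already observed the thread-free head can
// collect the remaining blocks without another atomic exchange:
//   mi_block_t* head = mi_tf_block(mi_atomic_load_relaxed(&page->xthread_free));
//   _mi_page_free_collect_partly(page, head);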
/* -----------------------------------------------------------
Page fresh and retire
----------------------------------------------------------- */
/*
// called from segments when reclaiming abandoned pages
void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
// mi_page_set_heap(page, heap);
// _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set)
_mi_page_free_collect(page, false); // ensure used count is up to date
mi_assert_expensive(mi_page_is_valid_init(page));
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
#endif
// mi_assert_internal(mi_page_heap(page) == heap);
// mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
// TODO: push on full queue immediately if it is full?
mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
mi_page_queue_push(heap, pq, page);
mi_assert_expensive(_mi_page_is_valid(page));
}
*/
// called from `mi_free` on a reclaim, and fresh_alloc if we get an abandoned page
void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page)
{
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_ptr_page(page)==page);
mi_assert_internal(mi_page_is_owned(page));
mi_assert_internal(mi_page_is_abandoned(page));
mi_page_set_heap(page,heap);
_mi_page_free_collect(page, false); // ensure used count is up to date
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
mi_page_queue_push_at_end(heap, pq, page);
mi_assert_expensive(_mi_page_is_valid(page));
}
void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
_mi_page_free_collect(page, false); // ensure used count is up to date
if (mi_page_all_free(page)) {
_mi_page_free(page, pq);
}
else {
mi_page_queue_remove(pq, page);
mi_heap_t* heap = page->heap;
mi_page_set_heap(page, NULL);
page->heap = heap; // don't set heap to NULL so we can reclaim_on_free within the same heap
_mi_arenas_page_abandon(page, heap->tld);
_mi_arenas_collect(false, false, heap->tld); // allow purging
}
}
// allocate a fresh page from a segment
static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) {
#if !MI_HUGE_PAGE_ABANDON
mi_assert_internal(pq != NULL);
mi_assert_internal(mi_heap_contains_queue(heap, pq));
mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_OBJ_SIZE_MAX || block_size == pq->block_size);
mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size);
#endif
mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments);
mi_page_t* page = _mi_arenas_page_alloc(heap, block_size, page_alignment);
if (page == NULL) {
// this may be out-of-memory, or an abandoned page was reclaimed (and in our queue)
// out-of-memory
return NULL;
}
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
#endif
mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
// a fresh page was found, initialize it
const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc
mi_assert_internal(full_block_size >= block_size);
mi_page_init(heap, page, full_block_size, heap->tld);
if (mi_page_is_abandoned(page)) {
_mi_heap_page_reclaim(heap, page);
if (!mi_page_immediate_available(page)) {
if (mi_page_is_expandable(page)) {
mi_page_extend_free(heap, page);
}
else {
mi_assert(false); // should not happen?
return NULL;
}
}
}
else if (pq != NULL) {
mi_page_queue_push(heap, pq, page);
}
mi_heap_stat_increase(heap, pages, 1);
mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
mi_heap_stat_increase(heap, page_bins[mi_page_bin(page)], 1);
if (pq != NULL) { mi_page_queue_push(heap, pq, page); }
mi_assert_expensive(_mi_page_is_valid(page));
return page;
}
@ -302,55 +335,21 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0);
if (page==NULL) return NULL;
mi_assert_internal(pq->block_size==mi_page_block_size(page));
mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page)));
mi_assert_internal(pq==mi_heap_page_queue_of(heap, page));
return page;
}
/* -----------------------------------------------------------
Do any delayed frees
(put there by other threads if they deallocated in a full page)
----------------------------------------------------------- */
void _mi_heap_delayed_free_all(mi_heap_t* heap) {
while (!_mi_heap_delayed_free_partial(heap)) {
mi_atomic_yield();
}
}
// returns true if all delayed frees were processed
bool _mi_heap_delayed_free_partial(mi_heap_t* heap) {
// take over the list (note: no atomic exchange since it is often NULL)
mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
bool all_freed = true;
// and free them all
while(block != NULL) {
mi_block_t* next = mi_block_nextx(heap,block, heap->keys);
// use internal free instead of regular one to keep stats etc correct
if (!_mi_free_delayed_block(block)) {
// we might already start delayed freeing while another thread has not yet
// reset the delayed_freeing flag; in that case delay it further by reinserting the current block
// into the delayed free list
all_freed = false;
mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
do {
mi_block_set_nextx(heap, block, dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
}
block = next;
}
return all_freed;
}
/* -----------------------------------------------------------
Unfull, abandon, free and retire
----------------------------------------------------------- */
// Move a page from the full list back to a regular list
// Move a page from the full list back to a regular list (called from thread-local mi_free)
void _mi_page_unfull(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(mi_page_is_in_full(page));
mi_assert_internal(!mi_page_heap(page)->allow_page_abandon);
if (!mi_page_is_in_full(page)) return;
mi_heap_t* heap = mi_page_heap(page);
@ -366,87 +365,40 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(!mi_page_immediate_available(page));
mi_assert_internal(!mi_page_is_in_full(page));
if (mi_page_is_in_full(page)) return;
mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
_mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
}
// Abandon a page with used blocks at the end of a thread.
// Note: only call if it is ensured that no references exist from
// the `page->heap->thread_delayed_free` into this page.
// Currently only called through `mi_heap_collect_ex` which ensures this.
void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(mi_page_heap(page) != NULL);
mi_heap_t* pheap = mi_page_heap(page);
// remove from our page list
mi_segments_tld_t* segments_tld = &pheap->tld->segments;
mi_page_queue_remove(pq, page);
// page is no longer associated with our heap
mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
mi_page_set_heap(page, NULL);
#if (MI_DEBUG>1) && !MI_TRACK_ENABLED
// check there are no references left..
for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) {
mi_assert_internal(_mi_ptr_page(block) != page);
}
#endif
// and abandon it
mi_assert_internal(mi_page_heap(page) == NULL);
_mi_segment_page_abandon(page,segments_tld);
}
// force abandon a page
void _mi_page_force_abandon(mi_page_t* page) {
mi_heap_t* heap = mi_page_heap(page);
// mark page as not using delayed free
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
// ensure this page is no longer in the heap delayed free list
_mi_heap_delayed_free_all(heap);
// We can still access the page meta-info even if it is freed as we ensure
// in `mi_segment_force_abandon` that the segment is not freed (yet)
if (page->capacity == 0) return; // it may have been freed now
// and now unlink it from the page queue and abandon (or free)
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
if (mi_page_all_free(page)) {
_mi_page_free(page, pq, false);
}
else {
if (heap->allow_page_abandon) {
// abandon full pages (this is the usual case in order to allow for sharing of memory between heaps)
_mi_page_abandon(page, pq);
}
else if (!mi_page_is_in_full(page)) {
// put full pages in a heap local queue (this is for heaps that cannot abandon, for example, if the heap can be destroyed)
mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
_mi_page_free_collect(page, false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
}
}
// Free a page with no more free blocks
void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(mi_page_all_free(page));
mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
// mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
// no more aligned blocks in here
mi_page_set_has_aligned(page, false);
// remove from the page list
// (no need to do _mi_heap_delayed_free first as all blocks are already free)
mi_heap_t* heap = mi_page_heap(page);
mi_segments_tld_t* segments_tld = &heap->tld->segments;
mi_page_queue_remove(pq, page);
// and free it
mi_heap_t* heap = page->heap;
mi_heap_stat_decrease(heap, page_bins[mi_page_bin(page)], 1);
mi_page_set_heap(page,NULL);
_mi_segment_page_free(page, force, segments_tld);
_mi_arenas_page_free(page);
_mi_arenas_collect(false, false, heap->tld); // allow purging
}
#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE
@ -476,9 +428,9 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
const size_t bsize = mi_page_block_size(page);
if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue?
if (pq->last==page && pq->first==page) { // the only page in the queue?
mi_stat_counter_increase(_mi_stats_main.pages_retire,1);
page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
mi_heap_t* heap = mi_page_heap(page);
mi_debug_heap_stat_counter_increase(heap, pages_retire, 1);
page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
mi_assert_internal(pq >= heap->pages);
const size_t index = pq - heap->pages;
mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
@ -489,7 +441,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
}
}
#endif
_mi_page_free(page, pq, false);
_mi_page_free(page, pq);
}
// free retired pages: we don't need to look at the entire queues
@ -504,7 +456,7 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
if (mi_page_all_free(page)) {
page->retire_expire--;
if (force || page->retire_expire == 0) {
_mi_page_free(pq->first, pq, force);
_mi_page_free(pq->first, pq);
}
else {
// keep retired, update min/max
@ -521,6 +473,29 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
heap->page_retired_max = max;
}
/*
static void mi_heap_collect_full_pages(mi_heap_t* heap) {
// note: normally full pages get immediately abandoned and the full queue is always empty
// this path is only used if abandoning is disabled due to a destroy-able heap or options
// set by the user.
mi_page_queue_t* pq = &heap->pages[MI_BIN_FULL];
for (mi_page_t* page = pq->first; page != NULL; ) {
mi_page_t* next = page->next; // get next in case we free the page
_mi_page_free_collect(page, false); // register concurrent free's
// no longer full?
if (!mi_page_is_full(page)) {
if (mi_page_all_free(page)) {
_mi_page_free(page, pq);
}
else {
_mi_page_unfull(page);
}
}
page = next;
}
}
*/
/* -----------------------------------------------------------
Initialize the initial free list in a page.
@ -534,7 +509,7 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
MI_UNUSED(stats);
#if (MI_SECURE<=2)
#if (MI_SECURE<3)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
#endif
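In secure mode the free list above is initialized in a randomized order so that allocation addresses are hard to predict. The real code interleaves a fixed number of slices; the following is a simpler Fisher-Yates-based sketch of the same idea (all names illustrative, not the actual algorithm):

#include <stddef.h>
#include <stdint.h>

static void free_list_link_shuffled(void** blocks, size_t n, uint64_t rnd) {
  if (n == 0) return;
  // shuffle the block order, then thread the list through the blocks
  for (size_t i = n - 1; i > 0; i--) {
    const size_t j = (size_t)(rnd % (i + 1));
    rnd = rnd*6364136223846793005ULL + 1442695040888963407ULL;  // LCG step
    void* tmp = blocks[i]; blocks[i] = blocks[j]; blocks[j] = tmp;
  }
  for (size_t i = 0; i + 1 < n; i++) { *(void**)blocks[i] = blocks[i + 1]; }
  *(void**)blocks[n - 1] = NULL;  // terminate the free list
}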
@ -592,7 +567,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
{
MI_UNUSED(stats);
#if (MI_SECURE <= 2)
#if (MI_SECURE<3)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
#endif
@ -620,7 +595,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
----------------------------------------------------------- */
#define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well.
#if (MI_SECURE>0)
#if (MI_SECURE>=3)
#define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many
#else
#define MI_MIN_EXTEND (1)
@ -631,9 +606,9 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
// Note: we also experimented with "bump" allocation on the first
// allocations but this did not speed up any benchmark (due to an
// extra test in malloc? or cache effects?)
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) {
mi_assert_expensive(mi_page_is_valid_init(page));
#if (MI_SECURE<=2)
#if (MI_SECURE<3)
mi_assert(page->free == NULL);
mi_assert(page->local_free == NULL);
if (page->free != NULL) return;
@ -642,12 +617,12 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
size_t page_size;
//uint8_t* page_start =
_mi_segment_page_start(_mi_page_segment(page), page, &page_size);
mi_stat_counter_increase(tld->stats.pages_extended, 1);
mi_page_area(page, &page_size);
mi_debug_heap_stat_counter_increase(heap, pages_extended, 1);
// calculate the extend count
const size_t bsize = mi_page_block_size(page);
size_t extend = page->reserved - page->capacity;
size_t extend = (size_t)page->reserved - page->capacity;
mi_assert_internal(extend > 0);
size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize);
@ -663,56 +638,56 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
mi_assert_internal(extend < (1UL<<16));
// commit on demand?
if (page->slice_committed > 0) {
const size_t needed_size = (page->capacity + extend)*bsize;
const size_t needed_commit = _mi_align_up( mi_page_slice_offset_of(page, needed_size), MI_PAGE_MIN_COMMIT_SIZE );
if (needed_commit > page->slice_committed) {
mi_assert_internal(((needed_commit - page->slice_committed) % _mi_os_page_size()) == 0);
_mi_os_commit(mi_page_slice_start(page) + page->slice_committed, needed_commit - page->slice_committed, NULL);
page->slice_committed = needed_commit;
}
}
// and append the extend to the free list
if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) {
mi_page_free_list_extend(page, bsize, extend, &tld->stats );
if (extend < MI_MIN_SLICES || MI_SECURE<3) { //!mi_option_is_enabled(mi_option_secure)) {
mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats );
}
else {
mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats);
mi_page_free_list_extend_secure(heap, page, bsize, extend, &heap->tld->stats);
}
// enable the new free list
page->capacity += (uint16_t)extend;
mi_stat_increase(tld->stats.page_committed, extend * bsize);
mi_debug_heap_stat_increase(heap, page_committed, extend * bsize);
mi_assert_expensive(mi_page_is_valid_init(page));
}
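The extend computation above is the heart of lazy free-list initialization: a page never links all of its reserved blocks at once but grows the list in bounded increments, so programs that allocate little touch less memory. A sketch of just the sizing heuristic, with the constants assumed from the definitions above:

#include <stddef.h>

#define MAX_EXTEND_SIZE (4*1024)   // about one OS page worth of payload
#define MIN_EXTEND      (1)

static size_t extend_count(size_t bsize, size_t reserved, size_t capacity) {
  size_t extend = reserved - capacity;   // blocks not yet on any free list
  size_t max_extend = (bsize >= MAX_EXTEND_SIZE ? MIN_EXTEND : MAX_EXTEND_SIZE/bsize);
  if (max_extend < MIN_EXTEND) { max_extend = MIN_EXTEND; }
  if (extend > max_extend) { extend = max_extend; }  // bound the work per call
  return extend;
}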
// Initialize a fresh page
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) {
// Initialize a fresh page (that is already partially initialized)
void _mi_page_init(mi_heap_t* heap, mi_page_t* page) {
mi_assert(page != NULL);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert(segment != NULL);
mi_assert_internal(block_size > 0);
// set fields
mi_page_set_heap(page, heap);
page->block_size = block_size;
size_t page_size;
page->page_start = _mi_segment_page_start(segment, page, &page_size);
mi_track_mem_noaccess(page->page_start,page_size);
mi_assert_internal(page_size / block_size < (1L<<16));
page->reserved = (uint16_t)(page_size / block_size);
uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start);
mi_track_mem_noaccess(page_start,page_size);
mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16));
mi_assert_internal(page->reserved > 0);
#if (MI_PADDING || MI_ENCODE_FREELIST)
page->keys[0] = _mi_heap_random_next(heap);
page->keys[1] = _mi_heap_random_next(heap);
#endif
page->free_is_zero = page->is_zero_init;
#if MI_DEBUG>2
if (page->is_zero_init) {
mi_track_mem_defined(page->page_start, page_size);
mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size));
if (page->memid.initially_zero) {
mi_track_mem_defined(page->page_start, mi_page_committed(page));
mi_assert_expensive(mi_mem_is_zero(page_start, mi_page_committed(page)));
}
#endif
if (block_size > 0 && _mi_is_power_of_two(block_size)) {
page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size));
}
else {
page->block_size_shift = 0;
}
mi_assert_internal(page->capacity == 0);
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->used == 0);
mi_assert_internal(page->xthread_free == 0);
mi_assert_internal(mi_page_is_owned(page));
mi_assert_internal(page->xthread_free == 1);
mi_assert_internal(page->next == NULL);
mi_assert_internal(page->prev == NULL);
mi_assert_internal(page->retire_expire == 0);
@ -721,11 +696,11 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_internal(page->keys[0] != 0);
mi_assert_internal(page->keys[1] != 0);
#endif
mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift)));
mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift)));
mi_assert_expensive(mi_page_is_valid_init(page));
// initialize an initial free list
mi_page_extend_free(heap,page,tld);
mi_page_extend_free(heap,page);
mi_assert(mi_page_immediate_available(page));
}
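One detail worth noting in the initialization above is block_size_shift: for power-of-two block sizes the page records log2 of the block size, so mapping a pointer to its block index needs only a shift instead of a division. A hedged sketch (illustrative signature, not the mimalloc one):

#include <stddef.h>
#include <stdint.h>

static inline size_t block_index(const uint8_t* page_start, const uint8_t* p,
                                 size_t block_size, uint8_t block_size_shift) {
  const size_t ofs = (size_t)(p - page_start);
  return (block_size_shift != 0 ? (ofs >> block_size_shift) : (ofs / block_size));
}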
@ -734,70 +709,65 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
Find pages with free blocks
-------------------------------------------------------------*/
// search for a best next page to use for at most N pages (often cut short if immediate blocks are available)
#define MI_MAX_CANDIDATE_SEARCH (4)
// is the page not yet used up to its reserved space?
static bool mi_page_is_expandable(const mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_internal(page->capacity <= page->reserved);
return (page->capacity < page->reserved);
}
// Find a page with free blocks of `page->block_size`.
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
{
// search through the pages in "next fit" order
#if MI_STAT
size_t count = 0;
#endif
size_t candidate_count = 0; // we reset this on the first candidate to limit the search
long candidate_limit = 0; // we reset this on the first candidate to limit the search
long page_full_retain = (pq->block_size > MI_SMALL_MAX_OBJ_SIZE ? 0 : heap->page_full_retain); // only retain small pages
mi_page_t* page_candidate = NULL; // a page with free space
mi_page_t* page = pq->first;
while (page != NULL)
{
mi_page_t* next = page->next; // remember next
#if MI_STAT
mi_page_t* next = page->next; // remember next (as this page can move to another queue)
count++;
#endif
candidate_count++;
candidate_limit--;
// collect freed blocks by us and other threads
_mi_page_free_collect(page, false);
#if MI_MAX_CANDIDATE_SEARCH > 1
// search up to N pages for a best candidate
// is the local free list non-empty?
const bool immediate_available = mi_page_immediate_available(page);
bool immediate_available = mi_page_immediate_available(page);
if (!immediate_available) {
// collect freed blocks by us and other threads so we get a proper use count
_mi_page_free_collect(page, false);
immediate_available = mi_page_immediate_available(page);
}
// if the page is completely full, move it to the `mi_pages_full`
// queue so we don't visit long-lived pages too often.
if (!immediate_available && !mi_page_is_expandable(page)) {
mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
mi_page_to_full(page, pq);
page_full_retain--;
if (page_full_retain < 0) {
mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
mi_page_to_full(page, pq);
}
}
else {
// the page has free space, make it a candidate
// we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase the chances of freeing up pages)
if (page_candidate == NULL) {
page_candidate = page;
candidate_count = 0;
candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates);
}
else if (mi_page_all_free(page_candidate)) {
_mi_page_free(page_candidate, pq);
page_candidate = page;
}
// prefer to reuse fuller pages (in the hope the less used page gets freed)
else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page) && !mi_page_is_expandable(page)) {
else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { // && !mi_page_is_expandable(page)) {
page_candidate = page;
}
// if we find a non-expandable candidate, or have searched N pages, return with the best candidate
if (immediate_available || candidate_count > MI_MAX_CANDIDATE_SEARCH) {
if (immediate_available || candidate_limit <= 0) {
mi_assert_internal(page_candidate!=NULL);
break;
}
}
#else
// first-fit algorithm
#if 0
// first-fit algorithm without candidates
// If the page contains free blocks, we are done
if (mi_page_immediate_available(page) || mi_page_is_expandable(page)) {
break; // pick this one
@ -818,20 +788,26 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
if (page_candidate != NULL) {
page = page_candidate;
}
if (page != NULL && !mi_page_immediate_available(page)) {
mi_assert_internal(mi_page_is_expandable(page));
mi_page_extend_free(heap, page, heap->tld);
if (page != NULL) {
if (!mi_page_immediate_available(page)) {
mi_assert_internal(mi_page_is_expandable(page));
mi_page_extend_free(heap, page);
}
mi_assert_internal(mi_page_immediate_available(page));
}
if (page == NULL) {
_mi_heap_collect_retired(heap, false); // perhaps make a page available
page = mi_page_fresh(heap, pq);
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
if (page == NULL && first_try) {
// out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
page = mi_page_queue_find_free_ex(heap, pq, false);
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
}
}
else {
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
// move the page to the front of the queue
mi_page_queue_move_to_front(heap, pq, page);
page->retire_expire = 0;
@ -846,30 +822,25 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
// Find a page with free blocks of `size`.
static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
mi_page_queue_t* pq = mi_page_queue(heap, size);
static mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) {
// mi_page_queue_t* pq = mi_page_queue(heap, size);
mi_assert_internal(!mi_page_queue_is_huge(pq));
// check the first page: we do this even with candidate search, as otherwise we would re-search every time
mi_page_t* page = pq->first;
if (page != NULL) {
#if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
if mi_likely(page != NULL && mi_page_immediate_available(page)) {
#if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
mi_page_extend_free(heap, page, heap->tld);
mi_page_extend_free(heap, page);
mi_assert_internal(mi_page_immediate_available(page));
}
else
#endif
{
_mi_page_free_collect(page,false);
}
if (mi_page_immediate_available(page)) {
page->retire_expire = 0;
return page; // fast path
}
#endif
page->retire_expire = 0;
return page; // fast path
}
else {
return mi_page_queue_find_free_ex(heap, pq, true);
}
return mi_page_queue_find_free_ex(heap, pq, true);
}
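The two functions above implement a bounded "next fit with candidates" search: walk the queue, prefer fuller pages as candidates (so emptier ones can drain and be freed), and stop as soon as a block is immediately available or the candidate budget runs out. A compact sketch of the loop's shape, with illustrative types:

#include <stdbool.h>
#include <stddef.h>

typedef struct page_s {
  struct page_s* next;
  size_t used;       // blocks in use
  bool   immediate;  // a block is ready on the free list right now
} page_t;

static page_t* find_candidate(page_t* first, long budget) {
  page_t* best = NULL;
  for (page_t* p = first; p != NULL; p = p->next) {
    if (p->immediate) return p;                     // done: block available now
    if (best == NULL) { best = p; }                 // first usable page
    else if (p->used >= best->used) { best = p; }   // prefer fuller pages
    if (--budget <= 0) break;                       // bound the search
  }
  return best;
}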
@ -905,13 +876,13 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex
// Huge pages contain just one block, and the segment contains just that page.
// Huge pages are also used if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX)
// so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`.
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) {
size_t block_size = _mi_os_good_alloc_size(size);
mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0);
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment, mi_page_queue_t* pq) {
const size_t block_size = _mi_os_good_alloc_size(size);
// mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0);
#if MI_HUGE_PAGE_ABANDON
mi_page_queue_t* pq = NULL;
#error todo.
#else
mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size
// mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size
mi_assert_internal(mi_page_queue_is_huge(pq));
#endif
mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment);
@ -919,10 +890,9 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a
mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(mi_page_is_huge(page));
mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE);
mi_assert_internal(_mi_page_segment(page)->used==1);
mi_assert_internal(mi_page_is_singleton(page));
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
mi_assert_internal(mi_page_is_abandoned(page));
mi_page_set_heap(page, NULL);
#endif
mi_heap_stat_increase(heap, malloc_huge, mi_page_block_size(page));
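As a usage illustration, both of the following requests would end up on this huge-page path, assuming typical thresholds (the exact limits depend on the configuration):

// hypothetical example calls, not from the test suite
void* big     = mi_malloc(64*1024*1024);               // size well above MI_LARGE_MAX_OBJ_SIZE
void* aligned = mi_malloc_aligned(1024, 4*1024*1024);  // very large alignment
mi_free(aligned);
mi_free(big);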
@ -935,30 +905,30 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a
// Allocate a page
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept {
// huge allocation?
const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) {
if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) {
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
return NULL;
}
else {
return mi_huge_page_alloc(heap,size,huge_alignment);
}
if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) {
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
return NULL;
}
mi_page_queue_t* pq = mi_page_queue(heap, (huge_alignment > 0 ? MI_LARGE_MAX_OBJ_SIZE+1 : size));
// huge allocation?
if mi_unlikely(mi_page_queue_is_huge(pq) || req_size > MI_MAX_ALLOC_SIZE) {
return mi_huge_page_alloc(heap,size,huge_alignment,pq);
}
else {
// otherwise find a page with free blocks in our size segregated queues
#if MI_PADDING
mi_assert_internal(size >= MI_PADDING_SIZE);
#endif
return mi_find_free_page(heap, size);
return mi_find_free_page(heap, pq);
}
}
// Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for
// very large requested alignments in which case we use a huge segment.
// The `huge_alignment` is normally 0 but is set to a multiple of MI_SLICE_SIZE for
// very large requested alignments in which case we use a huge singleton page.
void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept
{
mi_assert_internal(heap != NULL);
@ -971,17 +941,14 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
mi_assert_internal(mi_heap_is_initialized(heap));
// do administrative tasks every N generic mallocs
if mi_unlikely(++heap->generic_count >= 100) {
if mi_unlikely(++heap->generic_count >= 1000) {
heap->generic_collect_count += heap->generic_count;
heap->generic_count = 0;
// call potential deferred free routines
_mi_deferred_free(heap, false);
// free delayed frees from other threads (but skip contended ones)
_mi_heap_delayed_free_partial(heap);
// collect every once in a while (10000 by default)
const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L);
const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L);
if (heap->generic_collect_count >= generic_collect) {
heap->generic_collect_count = 0;
mi_heap_collect(heap, false /* force? */);
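This pattern amortizes maintenance: only every N-th call into the generic path pays for deferred frees and an occasional collect, keeping the common path branch-light. A small sketch of the counting scheme (fields and thresholds illustrative):

typedef struct heap_s {
  long generic_count;          // generic calls since the last admin pass
  long generic_collect_count;  // generic calls since the last collect
} heap_t;

static void generic_admin(heap_t* heap, long collect_every) {
  if (++heap->generic_count < 1000) return;  // fast exit on most calls
  heap->generic_collect_count += heap->generic_count;
  heap->generic_count = 0;
  // ... run deferred-free callbacks here ...
  if (heap->generic_collect_count >= collect_every) {
    heap->generic_collect_count = 0;
    // ... collect the heap (non-forced) ...
  }
}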
@ -991,7 +958,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
// find (or allocate) a page of the right size
mi_page_t* page = mi_find_page(heap, size, huge_alignment);
if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more
mi_heap_collect(heap, true /* force */);
mi_heap_collect(heap, true /* force? */);
page = mi_find_page(heap, size, huge_alignment);
}
@ -1003,6 +970,8 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
mi_assert_internal(_mi_ptr_page(page)==page);
// and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc)
void* p;

View file

@ -239,6 +239,9 @@ void _mi_prim_thread_done_auto_done(void) {
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
MI_UNUSED(heap);
}
#endif
bool _mi_prim_thread_is_in_threadpool(void) {
return false;
}

View file

@ -64,7 +64,8 @@ static void* zone_valloc(malloc_zone_t* zone, size_t size) {
static void zone_free(malloc_zone_t* zone, void* p) {
MI_UNUSED(zone);
mi_cfree(p);
// mi_cfree(p); // checked free as `zone_free` may be called with invalid pointers
mi_free(p); // with the page_map and pagemap_commit=1 we can use the regular free
}
static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) {

View file

@ -62,6 +62,7 @@ terms of the MIT license. A copy of the license can be found in the file
#include <sys/syscall.h>
#endif
#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this?
//------------------------------------------------------------------------------------
// Use syscalls for some primitives to allow for libraries that override open/read/close etc.
@ -148,7 +149,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
}
#endif
}
config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE;
config->has_overcommit = unix_detect_overcommit();
config->has_partial_free = true; // mmap can free in parts
config->has_virtual_reserve = true; // todo: check if this is true for NetBSD? (for anonymous mmap with PROT_NONE)
@ -201,7 +202,8 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p
void* p = NULL;
#if defined(MAP_ALIGNED) // BSD
if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
size_t n = mi_bsr(try_alignment);
size_t idx = 0;
mi_bsr(try_alignment, &idx); // sets idx to the highest set bit (try_alignment > 1 here)
const size_t n = idx;
if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB
p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
@ -365,6 +367,9 @@ int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool comm
mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
mi_assert_internal(commit || !allow_large);
mi_assert_internal(try_alignment > 0);
if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) {
try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations
}
*is_zero = true;
int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
@ -412,7 +417,7 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
int err = 0;
// decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
err = unix_madvise(start, size, MADV_DONTNEED);
#if !MI_DEBUG && !MI_SECURE
#if !MI_DEBUG && MI_SECURE<=2
*needs_recommit = false;
#else
*needs_recommit = true;
@ -482,7 +487,7 @@ static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, co
int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
bool is_large = true;
*is_zero = true;
*addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
*addr = unix_mmap(hint_addr, size, MI_ARENA_SLICE_ALIGN, PROT_READ | PROT_WRITE, true, true, &is_large);
if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
unsigned long numa_mask = (1UL << numa_node);
// TODO: does `mbind` work correctly for huge OS pages? should we
@ -889,3 +894,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
}
#endif
bool _mi_prim_thread_is_in_threadpool(void) {
return false;
}

View file

@ -277,3 +277,7 @@ void _mi_prim_thread_done_auto_done(void) {
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
MI_UNUSED(heap);
}
bool _mi_prim_thread_is_in_threadpool(void) {
return false;
}

View file

@ -17,6 +17,14 @@ terms of the MIT license. A copy of the license can be found in the file
// Dynamically bind Windows API points for portability
//---------------------------------------------
#if defined(_MSC_VER)
#pragma warning(disable:28159) // don't use GetVersion
#pragma warning(disable:4996) // don't use GetVersion
#endif
static DWORD win_major_version = 6;
static DWORD win_minor_version = 0;
// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
// NtAllocateVirtualMemoryEx is used for huge OS page allocation (1GiB)
@ -111,16 +119,27 @@ static bool win_enable_large_os_pages(size_t* large_page_size)
// Initialize
//---------------------------------------------
static DWORD win_allocation_granularity = 64*MI_KiB;
void _mi_prim_mem_init( mi_os_mem_config_t* config )
{
config->has_overcommit = false;
config->has_partial_free = false;
config->has_virtual_reserve = true;
// windows version
OSVERSIONINFOW version; _mi_memzero_var(version);
version.dwOSVersionInfoSize = sizeof(version); // required, or GetVersionExW fails
if (GetVersionExW(&version)) {
win_major_version = version.dwMajorVersion;
win_minor_version = version.dwMinorVersion;
}
// get the page size
SYSTEM_INFO si;
GetSystemInfo(&si);
if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; }
if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; }
if (si.dwAllocationGranularity > 0) {
config->alloc_granularity = si.dwAllocationGranularity;
win_allocation_granularity = si.dwAllocationGranularity;
}
// get virtual address bits
if ((uintptr_t)si.lpMaximumApplicationAddress > 0) {
const size_t vbits = MI_SIZE_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress);
@ -183,7 +202,7 @@ int _mi_prim_free(void* addr, size_t size ) {
// the start of the region.
MEMORY_BASIC_INFORMATION info; _mi_memzero_var(info);
VirtualQuery(addr, &info, sizeof(info));
if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) {
if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)(4*MI_MiB)) {
errcode = 0;
err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
if (err) { errcode = GetLastError(); }
@ -211,7 +230,7 @@ static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_ali
}
#endif
// on modern Windows try to use VirtualAlloc2 for aligned allocation
if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
if (addr == NULL && try_alignment > win_allocation_granularity && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
reqs.Alignment = try_alignment;
MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
@ -247,7 +266,7 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen
// success, return the address
return p;
}
else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) &&
else if (max_retry_msecs > 0 && (try_alignment <= 8*MI_MiB) &&
(flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 &&
win_is_out_of_memory_error(GetLastError())) {
// if committing regular memory and being out-of-memory,
@ -823,3 +842,16 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
mi_allocator_done();
}
#endif
bool _mi_prim_thread_is_in_threadpool(void) {
#if (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64)
if (win_major_version >= 6) {
// check if this thread belongs to a windows threadpool
// see: <https://www.geoffchappell.com/studies/windows/km/ntoskrnl/inc/api/pebteb/teb/index.htm>
_TEB* const teb = NtCurrentTeb();
void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778)));
return (pool_data != NULL);
}
#endif
return false;
}

View file

@ -0,0 +1,146 @@
/* ----------------------------------------------------------------------------
Copyright (c) Microsoft Research
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include <atomic>
#include <DbgEng.h>
#include <map>
#include <string>
#include <vector>
#include <Windows.h>
#include "mimalloc.h"
#include "mimalloc/internal.h"
ULONG64 g_MiMallocBase = 0;
IDebugClient* g_DebugClient = nullptr;
IDebugControl* g_DebugControl = nullptr;
IDebugSymbols3* g_DebugSymbols = nullptr;
IDebugDataSpaces* g_DataSpaces = nullptr;
// Function to find mimalloc.dll base address at startup
HRESULT FindMimallocBase()
{
if (g_DebugSymbols == nullptr)
{
return E_FAIL;
}
return g_DebugSymbols->GetModuleByModuleName("mimalloc", 0, NULL, &g_MiMallocBase);
}
// Entry point for the extension
extern "C" __declspec(dllexport) HRESULT CALLBACK DebugExtensionInitialize(PULONG version, PULONG flags)
{
UNREFERENCED_PARAMETER(flags);
// Ensure Version is valid
if (!version)
{
return E_INVALIDARG;
}
// Set the version
*version = DEBUG_EXTENSION_VERSION(1, 0);
HRESULT hr = DebugCreate(__uuidof(IDebugClient), (void**)&g_DebugClient);
if (FAILED(hr))
{
return hr;
}
// Query for the IDebugControl interface
hr = g_DebugClient->QueryInterface(__uuidof(IDebugControl), (void**)&g_DebugControl);
if (FAILED(hr))
{
g_DebugClient->Release();
return hr;
}
hr = g_DebugClient->QueryInterface(__uuidof(IDebugSymbols3), (void**)&g_DebugSymbols);
if (FAILED(hr))
{
g_DebugControl->Release();
g_DebugClient->Release();
return hr;
}
hr = g_DebugClient->QueryInterface(__uuidof(IDebugDataSpaces), (void**)&g_DataSpaces);
if (FAILED(hr))
{
g_DebugSymbols->Release();
g_DebugControl->Release();
g_DebugClient->Release();
return hr;
}
// Find mimalloc base address at startup
hr = FindMimallocBase();
if (FAILED(hr) || g_MiMallocBase == 0)
{
return E_FAIL; // Prevent extension from loading
}
mi_register_output(
[](const char* msg, void* arg) {
g_DebugControl->Output(DEBUG_OUTPUT_ERROR, msg);
g_DebugControl->Output(DEBUG_OUTPUT_ERROR, "\n");
},
nullptr);
g_DebugControl->Output(DEBUG_OUTPUT_NORMAL, "mimalloc.dll base address found: 0x%llx\n", g_MiMallocBase);
return S_OK;
}
// Notifies the extension that a debug event has occurred
extern "C" __declspec(dllexport) void CALLBACK DebugExtensionNotify(ULONG notify, ULONG64 argument)
{
UNREFERENCED_PARAMETER(notify);
UNREFERENCED_PARAMETER(argument);
}
// Uninitializes the extension
extern "C" __declspec(dllexport) void CALLBACK DebugExtensionUninitialize()
{
if (g_DebugSymbols)
{
g_DebugSymbols->Release();
g_DebugSymbols = nullptr;
}
if (g_DebugControl)
{
g_DebugControl->Release();
g_DebugControl = nullptr;
}
if (g_DebugClient)
{
g_DebugClient->Release();
g_DebugClient = nullptr;
}
}
// Sample command: !mi_help
extern "C" __declspec(dllexport) HRESULT CALLBACK mi_help(PDEBUG_CLIENT Client, PCSTR args)
{
UNREFERENCED_PARAMETER(args);
// Print Help
g_DebugControl->Output(DEBUG_OUTPUT_NORMAL, "Hello from MiMalloc WinDbg Extension!\n");
return S_OK;
}
extern "C" __declspec(dllexport) HRESULT CALLBACK mi_dump_arenas(PDEBUG_CLIENT client, PCSTR args)
{
mi_debug_show_arenas();
return S_OK;
}
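Once built, the extension loads like any other WinDbg extension. A typical session might look as follows; the DLL name and path are assumptions, only the exported commands come from the code above:

.load C:\path\to\mimalloc_windbg.dll
!mi_help
!mi_dump_arenas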

View file

@ -7,7 +7,6 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/prim.h" // _mi_prim_random_buf
#include <string.h> // memset
/* ----------------------------------------------------------------------------
We use our own PRNG to keep predictable performance of random number generation
@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil
(gcc x64 has no register spills, and clang 6+ uses SSE instructions)
-----------------------------------------------------------------------------*/
static inline uint32_t rotl(uint32_t x, uint32_t shift) {
return (x << shift) | (x >> (32 - shift));
}
static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16);
x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12);
x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8);
x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7);
}
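The removed local rotl was only well-defined for shifts 1..31: a shift of 0 would evaluate x >> 32, which is undefined behavior in C. ChaCha only uses the constants 7, 8, 12, and 16, so this is about sharing one vetted helper rather than fixing a latent bug. A sketch of a rotate with the semantics mi_rotl32 is assumed to provide:

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, uint32_t shift) {
  const uint32_t s = shift & 31;                       // avoid shifting by 32
  return (s == 0 ? x : (x << s) | (x >> (32 - s)));    // well-defined for s == 0 too
}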
static void chacha_block(mi_random_ctx_t* ctx)
@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
// since we only use chacha for randomness (and not encryption) we
// do not _need_ to read 32-bit values as little endian but we do anyways
// just for being compatible :-)
memset(ctx, 0, sizeof(*ctx));
_mi_memzero(ctx, sizeof(*ctx));
for (size_t i = 0; i < 4; i++) {
const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
ctx->input[i] = read32(sigma,i);
@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
}
static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
memset(ctx_new, 0, sizeof(*ctx_new));
_mi_memzero(ctx_new, sizeof(*ctx_new));
_mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
ctx_new->input[12] = 0;
ctx_new->input[13] = 0;
@ -160,7 +155,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim
uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
x ^= _mi_prim_clock_now();
x ^= _mi_prim_clock_now();
// and do a few randomization steps
uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
for (uintptr_t i = 0; i < max; i++) {

View file

@ -1,142 +0,0 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* -----------------------------------------------------------
The following functions are to reliably find the segment or
block that encompasses any pointer p (or NULL if it is not
in any of our segments).
We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB)
set to 1 if it contains the segment meta data.
----------------------------------------------------------- */
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"
// Reduce total address space to reduce .bss (due to the `mi_segment_map`)
#if (MI_INTPTR_SIZE > 4) && MI_TRACK_ASAN
#define MI_SEGMENT_MAP_MAX_ADDRESS (128*1024ULL*MI_GiB) // 128 TiB (see issue #881)
#elif (MI_INTPTR_SIZE > 4)
#define MI_SEGMENT_MAP_MAX_ADDRESS (48*1024ULL*MI_GiB) // 48 TiB
#else
#define MI_SEGMENT_MAP_MAX_ADDRESS (UINT32_MAX)
#endif
#define MI_SEGMENT_MAP_PART_SIZE (MI_INTPTR_SIZE*MI_KiB - 128) // 128 > sizeof(mi_memid_t) !
#define MI_SEGMENT_MAP_PART_BITS (8*MI_SEGMENT_MAP_PART_SIZE)
#define MI_SEGMENT_MAP_PART_ENTRIES (MI_SEGMENT_MAP_PART_SIZE / MI_INTPTR_SIZE)
#define MI_SEGMENT_MAP_PART_BIT_SPAN (MI_SEGMENT_ALIGN) // memory area covered by 1 bit
#if (MI_SEGMENT_MAP_PART_BITS < (MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_BIT_SPAN)) // prevent overflow on 32-bit (issue #1017)
#define MI_SEGMENT_MAP_PART_SPAN (MI_SEGMENT_MAP_PART_BITS * MI_SEGMENT_MAP_PART_BIT_SPAN)
#else
#define MI_SEGMENT_MAP_PART_SPAN MI_SEGMENT_MAP_MAX_ADDRESS
#endif
#define MI_SEGMENT_MAP_MAX_PARTS ((MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_SPAN) + 1)
// A part of the segment map.
typedef struct mi_segmap_part_s {
mi_memid_t memid;
_Atomic(uintptr_t) map[MI_SEGMENT_MAP_PART_ENTRIES];
} mi_segmap_part_t;
// Allocate parts on-demand to reduce .bss footprint
static _Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. }
static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) {
// note: segment can be invalid or NULL.
mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE?
*idx = 0;
*bitidx = 0;
if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL;
const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN;
if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL;
mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[segindex]);
// allocate on demand to reduce .bss footprint
if mi_unlikely(part == NULL) {
if (!create_on_demand) return NULL;
mi_memid_t memid;
part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid);
if (part == NULL) return NULL;
part->memid = memid;
mi_segmap_part_t* expected = NULL;
if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) {
_mi_os_free(part, sizeof(mi_segmap_part_t), memid);
part = expected;
if (part == NULL) return NULL;
}
}
mi_assert(part != NULL);
const uintptr_t offset = ((uintptr_t)segment) % MI_SEGMENT_MAP_PART_SPAN;
const uintptr_t bitofs = offset / MI_SEGMENT_MAP_PART_BIT_SPAN;
*idx = bitofs / MI_INTPTR_BITS;
*bitidx = bitofs % MI_INTPTR_BITS;
return part;
}
void _mi_segment_map_allocated_at(const mi_segment_t* segment) {
if (segment->memid.memkind == MI_MEM_ARENA) return; // we lookup segments first in the arena's and don't need the segment map
size_t index;
size_t bitidx;
mi_segmap_part_t* part = mi_segment_map_index_of(segment, true /* alloc map if needed */, &index, &bitidx);
if (part == NULL) return; // outside our address range..
uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]);
uintptr_t newmask;
do {
newmask = (mask | ((uintptr_t)1 << bitidx));
} while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask));
}
void _mi_segment_map_freed_at(const mi_segment_t* segment) {
if (segment->memid.memkind == MI_MEM_ARENA) return;
size_t index;
size_t bitidx;
mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* don't alloc if not present */, &index, &bitidx);
if (part == NULL) return; // outside our address range..
uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]);
uintptr_t newmask;
do {
newmask = (mask & ~((uintptr_t)1 << bitidx));
} while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask));
}
// Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
static mi_segment_t* _mi_segment_of(const void* p) {
if (p == NULL) return NULL;
mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL
size_t index;
size_t bitidx;
mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx);
if (part == NULL) return NULL;
const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]);
if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) {
bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok);
return segment; // yes, allocated by us
}
return NULL;
}
// Is this a valid pointer in our heap?
static bool mi_is_valid_pointer(const void* p) {
// first check if it is in an arena, then check if it is OS allocated
return (_mi_arena_contains(p) || _mi_segment_of(p) != NULL);
}
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return mi_is_valid_pointer(p);
}
void _mi_segment_map_unsafe_destroy(void) {
for (size_t i = 0; i < MI_SEGMENT_MAP_MAX_PARTS; i++) {
mi_segmap_part_t* part = mi_atomic_exchange_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[i], NULL);
if (part != NULL) {
_mi_os_free(part, sizeof(mi_segmap_part_t), part->memid);
}
}
}

File diff suppressed because it is too large

View file

@ -20,10 +20,11 @@ terms of the MIT license. A copy of the license can be found in the file
// containing the whole library. If it is linked first
// it will override all the standard library allocation
// functions (on Unix systems).
#include "alloc.c" // includes alloc-override.c
#include "alloc.c" // includes alloc-override.c and free.c
#include "alloc-aligned.c"
#include "alloc-posix.c"
#include "arena.c"
#include "arena-meta.c"
#include "bitmap.c"
#include "heap.c"
#include "init.c"
@ -31,9 +32,8 @@ terms of the MIT license. A copy of the license can be found in the file
#include "options.c"
#include "os.c"
#include "page.c" // includes page-queue.c
#include "page-map.c"
#include "random.c"
#include "segment.c"
#include "segment-map.c"
#include "stats.c"
#include "prim/prim.c"
#if MI_OSX_ZONE

View file

@ -19,48 +19,77 @@ terms of the MIT license. A copy of the license can be found in the file
Statistics operations
----------------------------------------------------------- */
static bool mi_is_in_main(void* stat) {
return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main
&& (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t)));
static void mi_stat_update_mt(mi_stat_count_t* stat, int64_t amount) {
if (amount == 0) return;
// add atomically
int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
if (amount > 0) {
mi_atomic_addi64_relaxed(&stat->total, amount);
}
}
static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
if (amount == 0) return;
if mi_unlikely(mi_is_in_main(stat))
{
// add atomically (for abandoned pages)
int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
if (amount > 0) {
mi_atomic_addi64_relaxed(&stat->total,amount);
}
}
else {
// add thread local
stat->current += amount;
if (stat->current > stat->peak) { stat->peak = stat->current; }
if (amount > 0) { stat->total += amount; }
}
// add thread local
stat->current += amount;
if (stat->current > stat->peak) { stat->peak = stat->current; }
if (amount > 0) { stat->total += amount; }
}
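The _mt variants above must maintain the peak atomically across threads; mimalloc does this with an atomic-max helper. A sketch of such a helper using C11 atomics (mimalloc wraps these in its own mi_atomic_* macros):

#include <stdatomic.h>
#include <stdint.h>

static void atomic_max_i64(_Atomic(int64_t)* peak, int64_t value) {
  int64_t cur = atomic_load_explicit(peak, memory_order_relaxed);
  while (cur < value &&
         !atomic_compare_exchange_weak_explicit(peak, &cur, value,
              memory_order_relaxed, memory_order_relaxed)) {
    // a failed CAS reloads `cur`; retry until peak >= value
  }
}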
void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
if (mi_is_in_main(stat)) {
mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
}
else {
stat->total += amount;
}
void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) {
mi_atomic_addi64_relaxed(&stat->total, (int64_t)amount);
}
void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
stat->total += amount;
}
void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) {
mi_stat_update_mt(stat, (int64_t)amount);
}
void __mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
mi_stat_update(stat, (int64_t)amount);
}
void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount) {
mi_stat_update_mt(stat, -((int64_t)amount));
}
void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
mi_stat_update(stat, -((int64_t)amount));
}
// Adjust stats to compensate; for example before committing a range,
// first adjust downwards with parts that were already committed so
// we avoid double counting.
static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount) {
if (amount == 0) return;
// adjust atomically
mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_addi64_relaxed(&stat->total, amount);
}
static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) {
if (amount == 0) return;
stat->current += amount;
stat->total += amount;
}
void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount) {
mi_stat_adjust_mt(stat, (int64_t)amount);
}
void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount) {
mi_stat_adjust(stat, (int64_t)amount);
}
void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount) {
mi_stat_adjust_mt(stat, -((int64_t)amount));
}
void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) {
mi_stat_adjust(stat, -((int64_t)amount));
}
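A worked example of the compensation described above, with hypothetical sizes: suppose 64 KiB is about to be committed but 16 KiB of that range was already counted. Adjusting down first keeps the running totals honest:

// hypothetical caller, using the functions declared above
__mi_stat_adjust_decrease(&stats->committed, 16*1024);  // part already counted
__mi_stat_increase(&stats->committed, 64*1024);         // count the full commit
// net effect on `current` and `total`: +48 KiB, the newly committed memory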
// must be thread safe as it is called from stats_merge
static void mi_stat_count_add(mi_stat_count_t* stat, const mi_stat_count_t* src) {
@ -94,8 +123,8 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
}
#endif
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
mi_stat_count_add(&stats->page_bins[i], &src->page_bins[i]);
}
mi_stat_count_add(&stats->page_bins[i], &src->page_bins[i]);
}
}
#undef MI_STAT_COUNT
@ -109,7 +138,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
// unit == 0: count as decimal
// unit < 0 : count in binary
static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) {
char buf[32]; buf[0] = 0;
char buf[32]; _mi_memzero_var(buf);
int len = 32;
const char* suffix = (unit <= 0 ? " " : "B");
const int64_t base = (unit == 0 ? 1000 : 1024);
@ -274,7 +303,7 @@ static void mi_cdecl mi_buffered_out(const char* msg, void* arg) {
static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
// wrap the output function to be line buffered
char buf[256];
char buf[256]; _mi_memzero_var(buf);
buffered_t buffer = { out0, arg0, NULL, 0, 255 };
buffer.buf = buf;
mi_output_fun* out = &mi_buffered_out;
@ -302,15 +331,20 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
mi_stat_peak_print(&stats->reset, "reset", 1, out, arg );
mi_stat_peak_print(&stats->purged, "purged", 1, out, arg );
mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, "");
mi_stat_print(&stats->segments, "segments", -1, out, arg);
mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
// mi_stat_print(&stats->segments, "segments", -1, out, arg);
// mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
// mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
mi_stat_print(&stats->pages, "pages", -1, out, arg);
mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg);
mi_stat_counter_print(&stats->pages_reclaim_on_alloc, "-reclaima", out, arg);
mi_stat_counter_print(&stats->pages_reclaim_on_free, "-reclaimf", out, arg);
mi_stat_counter_print(&stats->pages_reabandon_full, "-reabandon", out, arg);
mi_stat_counter_print(&stats->pages_unabandon_busy_wait, "-waits", out, arg);
mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg);
mi_stat_counter_print(&stats->pages_retire, "-retire", out, arg);
mi_stat_counter_print(&stats->arena_count, "arenas", out, arg);
// mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg);
// mi_stat_counter_print(&stats->arena_purges, "-purges", out, arg);
mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg);
mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg);
mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
@ -343,36 +377,37 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
static mi_msecs_t mi_process_start; // = 0
static mi_stats_t* mi_stats_get_default(void) {
mi_heap_t* heap = mi_heap_get_default();
return &heap->tld->stats;
}
static void mi_stats_merge_from(mi_stats_t* stats) {
if (stats != &_mi_stats_main) {
mi_stats_add(&_mi_stats_main, stats);
memset(stats, 0, sizeof(mi_stats_t));
}
// return thread local stats
static mi_stats_t* mi_get_tld_stats(void) {
return &mi_heap_get_default()->tld->stats;
}
void mi_stats_reset(void) mi_attr_noexcept {
mi_stats_t* stats = mi_stats_get_default();
if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
mi_stats_t* stats = mi_get_tld_stats();
mi_subproc_t* subproc = _mi_subproc();
if (stats != &subproc->stats) { _mi_memzero(stats, sizeof(mi_stats_t)); }
_mi_memzero(&subproc->stats, sizeof(mi_stats_t));
if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
}
void mi_stats_merge(void) mi_attr_noexcept {
mi_stats_merge_from( mi_stats_get_default() );
void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from) {
if (to != from) {
mi_stats_add(to, from);
_mi_memzero(from, sizeof(mi_stats_t));
}
}
void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done`
mi_stats_merge_from(stats);
_mi_stats_merge_from(&_mi_subproc()->stats, stats);
}
void mi_stats_merge(void) mi_attr_noexcept {
_mi_stats_done( mi_get_tld_stats() );
}
void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_stats_merge_from(mi_stats_get_default());
_mi_stats_print(&_mi_stats_main, out, arg);
mi_stats_merge();
_mi_stats_print(&_mi_subproc()->stats, out, arg);
}
void mi_stats_print(void* out) mi_attr_noexcept {
@ -381,7 +416,7 @@ void mi_stats_print(void* out) mi_attr_noexcept {
}
void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
_mi_stats_print(mi_stats_get_default(), out, arg);
_mi_stats_print(mi_get_tld_stats(), out, arg);
}
@ -415,11 +450,12 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) {
mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
{
mi_subproc_t* subproc = _mi_subproc();
mi_process_info_t pinfo;
_mi_memzero_var(pinfo);
pinfo.elapsed = _mi_clock_end(mi_process_start);
pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.current)));
pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.peak)));
pinfo.current_rss = pinfo.current_commit;
pinfo.peak_rss = pinfo.peak_commit;
pinfo.utime = 0;
@ -447,7 +483,7 @@ void mi_stats_get(size_t stats_size, mi_stats_t* stats) mi_attr_noexcept {
if (stats == NULL || stats_size == 0) return;
_mi_memzero(stats, stats_size);
const size_t size = (stats_size > sizeof(mi_stats_t) ? sizeof(mi_stats_t) : stats_size);
_mi_memcpy(stats, &_mi_stats_main, size);
_mi_memcpy(stats, &_mi_subproc()->stats, size);
stats->version = MI_STAT_VERSION;
}
@ -494,9 +530,9 @@ static void mi_heap_buf_print(mi_heap_buf_t* hbuf, const char* msg) {
static void mi_heap_buf_print_count_bin(mi_heap_buf_t* hbuf, const char* prefix, mi_stat_count_t* stat, size_t bin, bool add_comma) {
const size_t binsize = _mi_bin_size(bin);
const size_t pagesize = (binsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_SMALL_PAGE_SIZE :
(binsize <= MI_MEDIUM_OBJ_SIZE_MAX ? MI_MEDIUM_PAGE_SIZE :
(binsize <= MI_LARGE_OBJ_SIZE_MAX ? MI_LARGE_PAGE_SIZE : 0)));
const size_t pagesize = (binsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_SMALL_PAGE_SIZE :
(binsize <= MI_MEDIUM_MAX_OBJ_SIZE ? MI_MEDIUM_PAGE_SIZE :
(binsize <= MI_LARGE_MAX_OBJ_SIZE ? MI_LARGE_PAGE_SIZE : 0)));
char buf[128];
_mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld, \"block_size\": %zu, \"page_size\": %zu }%s\n", prefix, stat->total, stat->peak, stat->current, binsize, pagesize, (add_comma ? "," : ""));
buf[127] = 0;
@ -576,7 +612,7 @@ char* mi_stats_get_json(size_t output_size, char* output_buf) mi_attr_noexcept {
mi_heap_buf_print(&hbuf, " },\n");
// statistics
mi_stats_t* stats = &_mi_stats_main;
mi_stats_t* stats = &_mi_subproc()->stats;
MI_STAT_FIELDS()
// size bins
@ -589,7 +625,7 @@ char* mi_stats_get_json(size_t output_size, char* output_buf) mi_attr_noexcept {
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
mi_heap_buf_print_count_bin(&hbuf, " ", &stats->page_bins[i], i, i!=MI_BIN_HUGE);
}
mi_heap_buf_print(&hbuf, " ]\n");
mi_heap_buf_print(&hbuf, " ]\n");
mi_heap_buf_print(&hbuf, "}\n");
return hbuf.buf;
}

View file

@ -12,4 +12,14 @@ std::string TestAllocInDll::GetString()
std::string r = test;
delete[] test;
return r;
}
#include <windows.h>
void TestAllocInDll::TestHeapAlloc()
{
HANDLE heap = GetProcessHeap();
int* p = (int*)HeapAlloc(heap, 0, sizeof(int));
*p = 42;
HeapFree(heap, 0, p);
}

View file

@ -8,4 +8,5 @@ class TestAllocInDll
{
public:
__declspec(dllexport) std::string GetString();
__declspec(dllexport) void TestHeapAlloc();
};

View file

@ -32,7 +32,7 @@ static void test_manage_os_memory(void);
int main() {
mi_version();
mi_stats_reset();
// mi_bins();
// test_manage_os_memory();
@ -84,7 +84,7 @@ int main() {
static void invalid_free() {
free((void*)0xBADBEEF);
realloc((void*)0xBADBEEF,10);
realloc((void*)0xBADBEEF, 10);
}
static void block_overflow1() {
@ -182,7 +182,7 @@ static void test_process_info(void) {
size_t peak_commit = 0;
size_t page_faults = 0;
for (int i = 0; i < 100000; i++) {
void* p = calloc(100,10);
void* p = calloc(100, 10);
free(p);
}
mi_process_info(&elapsed, &user_msecs, &system_msecs, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
@ -193,7 +193,7 @@ static void test_reserved(void) {
#define KiB 1024ULL
#define MiB (KiB*KiB)
#define GiB (MiB*KiB)
mi_reserve_os_memory(4*GiB, false, true);
mi_reserve_os_memory(3*GiB, false, true);
void* p1 = malloc(100);
void* p2 = malloc(100000);
void* p3 = malloc(2*GiB);
@ -240,8 +240,8 @@ static void test_heap_walk(void) {
}
static void test_canary_leak(void) {
char* p = mi_mallocn_tp(char,23);
for(int i = 0; i < 23; i++) {
char* p = mi_mallocn_tp(char, 22);
for (int i = 0; i < 22; i++) {
p[i] = '0'+i;
}
puts(p);
@ -251,7 +251,7 @@ static void test_canary_leak(void) {
#if _WIN32
static void test_manage_os_memory(void) {
size_t size = 256 * 1024 * 1024;
void* ptr = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
void* ptr = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
mi_arena_id_t arena_id;
mi_manage_os_memory_ex(ptr, size, true /* committed */, true /* pinned */, false /* is zero */, -1 /* numa node */, true /* exclusive */, &arena_id);
mi_heap_t* cuda_heap = mi_heap_new_in_arena(arena_id); // you can do this in any thread
@ -260,11 +260,11 @@ static void test_manage_os_memory(void) {
void* p1 = mi_heap_malloc(cuda_heap, 8);
int* p2 = mi_heap_malloc_tp(cuda_heap, int);
*p2 = 42;
// and maybe set the cuda heap as the default heap? (but careful as now `malloc` will allocate in the cuda heap as well)
{
mi_heap_t* prev_default_heap = mi_heap_set_default(cuda_heap);
void* p3 = mi_malloc(8); // allocate in the cuda heap
void* p3 = mi_malloc(8); // allocate in the cuda heap
mi_free(p3);
}
mi_free(p1);
@ -287,15 +287,15 @@ static void test_manage_os_memory(void) {
static void test_large_pages(void) {
mi_memid_t memid;
#if 0
#if 0
size_t pages_reserved;
size_t page_size;
uint8_t* p = (uint8_t*)_mi_os_alloc_huge_os_pages(1, -1, 30000, &pages_reserved, &page_size, &memid);
const size_t req_size = pages_reserved * page_size;
#else
#else
const size_t req_size = 64*MI_MiB;
uint8_t* p = (uint8_t*)_mi_os_alloc(req_size,&memid,NULL);
#endif
uint8_t* p = (uint8_t*)_mi_os_alloc(req_size, &memid, NULL);
#endif
p[0] = 1;
@ -318,8 +318,8 @@ static void test_large_pages(void) {
#if 0
#include <stdint.h>
#include <stdbool.h>
#include <mimalloc/bits.h>
#define MI_INTPTR_SIZE 8
#define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE)
#define MI_BIN_HUGE 100
@ -371,8 +371,6 @@ uint8_t _mi_bsr(uintptr_t x) {
#endif
}
static inline size_t _mi_wsize_from_size(size_t size) {
return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
}
@ -412,7 +410,9 @@ static inline size_t mi_bin(size_t wsize) {
#endif
wsize--;
// find the highest bit
const size_t b = _mi_bsr(wsize); // note: wsize != 0
size_t idx;
mi_bsr(wsize, &idx);
uint8_t b = (uint8_t)idx;
// and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
// - adjust with 3 because we do not round the first 8 sizes
// which each get an exact bin
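To make the top-3-bits comment concrete, here is a worked example; the binning formula is assumed to be the usual mimalloc one, bin = (b << 2) + ((wsize >> (b-2)) & 3) - 3, which sits just outside this hunk:

// worked example: a request of wsize = 100 words
//   wsize-1 = 99 = 0b1100011
//   highest bit:   b = 6
//   next two bits: (99 >> 4) & 3 = 2
//   bin = (6 << 2) + 2 - 3 = 23
// the leading bit plus the next two (the "top 3 bits") select the bin,
// splitting each power of two into 4 sub-bins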
@ -446,7 +446,9 @@ static inline uint8_t _mi_bin4(size_t size) {
bin = MI_BIN_HUGE;
}
else {
uint8_t b = mi_bsr32((uint32_t)wsize);
size_t idx;
mi_bsr(wsize, &idx);
uint8_t b = (uint8_t)idx;
bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3;
}
return bin;
@ -462,7 +464,9 @@ static size_t _mi_binx4(size_t wsize) {
bin = (uint8_t)wsize;
}
else {
uint8_t b = mi_bsr32((uint32_t)wsize);
size_t idx;
mi_bsr(wsize, &idx);
uint8_t b = (uint8_t)idx;
if (b <= 1) return wsize;
bin = ((b << 1) | (wsize >> (b - 1))&0x01) + 3;
}
@ -471,7 +475,9 @@ static size_t _mi_binx4(size_t wsize) {
static size_t _mi_binx8(size_t bsize) {
if (bsize<=1) return bsize;
uint8_t b = mi_bsr32((uint32_t)bsize);
size_t idx;
mi_bsr(bsize, &idx);
uint8_t b = (uint8_t)idx;
if (b <= 2) return bsize;
size_t bin = ((b << 2) | (bsize >> (b - 2))&0x03) - 5;
return bin;
@ -489,8 +495,10 @@ static inline size_t mi_binx(size_t wsize) {
}
else {
wsize--;
assert(wsize>0);
// find the highest bit
uint8_t b = (uint8_t)mi_bsr32((uint32_t)wsize); // note: wsize != 0
uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize));
// and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
// - adjust with 3 because we do not round the first 8 sizes
// which each get an exact bin

View file

@ -37,7 +37,7 @@ static void test_thread_local(); // issue #944
static void test_mixed1(); // issue #942
static void test_stl_allocators();
#if x_WIN32
#if _WIN32
#include "main-override-dep.h"
static void test_dep(); // issue #981: test overriding in another DLL
#else
@ -46,8 +46,8 @@ static void test_dep() { };
int main() {
mi_stats_reset(); // ignore earlier allocations
various_tests();
test_mixed1();
//various_tests();
//test_mixed1();
test_dep();
@ -145,11 +145,13 @@ static bool test_stl_allocator1() {
struct some_struct { int i; int j; double z; };
#if x_WIN32
#if _WIN32
static void test_dep()
{
TestAllocInDll t;
std::string s = t.GetString();
t.TestHeapAlloc();
}
#endif

View file

@ -34,7 +34,7 @@ we therefore test the API over various inputs. Please add more tests :-)
#include "mimalloc.h"
// #include "mimalloc/internal.h"
#include "mimalloc/types.h" // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX
#include "mimalloc/types.h" // for MI_DEBUG and MI_PAGE_MAX_OVERALLOC_ALIGN
#include "testhelper.h"
@ -169,7 +169,7 @@ int main(void) {
/*
CHECK_BODY("malloc-aligned6") {
bool ok = true;
for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) {
for (size_t align = 1; align <= MI_PAGE_MAX_OVERALLOC_ALIGN && ok; align *= 2) {
void* ps[8];
for (int i = 0; i < 8 && ok; i++) {
ps[i] = mi_malloc_aligned(align*13 // size
@ -186,16 +186,16 @@ int main(void) {
};
*/
CHECK_BODY("malloc-aligned7") {
void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX);
void* p = mi_malloc_aligned(1024,MI_PAGE_MAX_OVERALLOC_ALIGN);
mi_free(p);
result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0;
result = ((uintptr_t)p % MI_PAGE_MAX_OVERALLOC_ALIGN) == 0;
};
CHECK_BODY("malloc-aligned8") {
bool ok = true;
for (int i = 0; i < 5 && ok; i++) {
int n = (1 << i);
void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX);
ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0;
void* p = mi_malloc_aligned(1024, n * MI_PAGE_MAX_OVERALLOC_ALIGN);
ok = ((uintptr_t)p % (n*MI_PAGE_MAX_OVERALLOC_ALIGN)) == 0;
mi_free(p);
}
result = ok;
@@ -203,7 +203,7 @@ int main(void) {
CHECK_BODY("malloc-aligned9") { // test large alignments
bool ok = true;
void* p[8];
size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 };
size_t sizes[8] = { 8, 512, 1024 * 1024, MI_PAGE_MAX_OVERALLOC_ALIGN, MI_PAGE_MAX_OVERALLOC_ALIGN + 1, 2 * MI_PAGE_MAX_OVERALLOC_ALIGN, 8 * MI_PAGE_MAX_OVERALLOC_ALIGN, 0 };
for (int i = 0; i < 28 && ok; i++) {
int align = (1 << i);
for (int j = 0; j < 8 && ok; j++) {
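The rename reflects that alignments above `MI_PAGE_MAX_OVERALLOC_ALIGN` take a separate over-allocation path, but the property under test is unchanged: the returned pointer must be a multiple of the requested alignment. A minimal standalone check against the public API (sketch; the size and alignment range are arbitrary choices, not taken from the test):

#include <stdint.h>
#include <assert.h>
#include <mimalloc.h>

int main(void) {
  for (size_t align = 8; align <= ((size_t)1 << 20); align <<= 1) {
    void* p = mi_malloc_aligned(1024, align);        // size, alignment
    assert(p != NULL && ((uintptr_t)p % align) == 0);
    mi_free(p);
  }
  return 0;
}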


@@ -40,6 +40,19 @@ static int ITER = 20;
static int THREADS = 8;
static int SCALE = 10;
static int ITER = 10;
#elif 0
static int THREADS = 4;
static int SCALE = 10;
static int ITER = 20;
#elif 0
static int THREADS = 32;
static int SCALE = 50;
static int ITER = 50;
#elif 0
static int THREADS = 32;
static int SCALE = 25;
static int ITER = 50;
#define ALLOW_LARGE true
#else
static int THREADS = 32; // more repeatable if THREADS <= #processors
static int SCALE = 50; // scaling factor
@@ -50,7 +63,12 @@ static int ITER = 50; // N full iterations destructing and re-creating a
#define STRESS // undefine for leak test
static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100)
#ifndef ALLOW_LARGE
#define ALLOW_LARGE false
#endif
static bool allow_large_objects = ALLOW_LARGE; // allow very large objects? (set to `true` if SCALE>100)
static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`?
static bool main_participates = false; // main thread participates as a worker too
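The `#ifndef ALLOW_LARGE` fallback lets any of the preset blocks above opt in to large objects while the default stays `false` — a standard define-with-default idiom. Sketched in isolation with a hypothetical `USE_FEATURE` flag:

#include <stdbool.h>

#if 0                        // a preset block may opt in by defining the flag...
#define USE_FEATURE true
#endif
#ifndef USE_FEATURE          // ...otherwise it falls back to the default
#define USE_FEATURE false
#endif
static const bool use_feature = USE_FEATURE;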
@@ -67,7 +85,7 @@ static bool main_participates = false; // main thread participates as a
#define custom_free(p) mi_free(p)
#ifndef NDEBUG
#define HEAP_WALK // walk the heap objects?
#define xHEAP_WALK // walk the heap objects?
#endif
#endif
@@ -242,9 +260,23 @@ static void test_stress(void) {
//mi_debug_show_arenas(true);
#endif
#if !defined(NDEBUG) || defined(MI_TSAN)
if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); }
if ((n + 1) % 10 == 0) {
printf("- iterations left: %3d\n", ITER - (n + 1));
#ifndef USE_STD_MALLOC
mi_debug_show_arenas();
#endif
//mi_collect(true);
//mi_debug_show_arenas();
}
#endif
}
// clean up
for (int i = 0; i < TRANSFERS; i++) {
void* p = atomic_exchange_ptr(&transfer[i], NULL);
if (p != NULL) {
free_items(p);
}
}
}
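The new cleanup drains every transfer slot exactly once: swapping NULL in atomically means whichever thread performs the exchange owns the old contents and may safely free them. A sketch of that handoff with C11 atomics (assuming `atomic_exchange_ptr` is a thin wrapper over `atomic_exchange`; the slot count here is illustrative):

#include <stdatomic.h>
#include <stddef.h>

#define TRANSFERS 8
static _Atomic(void*) transfer[TRANSFERS];

static void drain_all(void (*free_items)(void*)) {
  for (int i = 0; i < TRANSFERS; i++) {
    void* p = atomic_exchange(&transfer[i], NULL);  // claim the slot exclusively
    if (p != NULL) free_items(p);                   // only the claimer frees it
  }
}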
#ifndef STRESS
@@ -284,7 +316,12 @@ int main(int argc, char** argv) {
mi_option_enable(mi_option_visit_abandoned);
#endif
#if !defined(NDEBUG) && !defined(USE_STD_MALLOC)
mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */);
// mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */);
// mi_option_set(mi_option_purge_delay,1);
#endif
#if defined(NDEBUG) && !defined(USE_STD_MALLOC)
// mi_option_set(mi_option_purge_delay,-1);
mi_option_set(mi_option_page_reclaim_on_free, 0);
#endif
// > mimalloc-test-stress [THREADS] [SCALE] [ITER]
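Note that `mi_option_set`/`mi_option_enable` should run before the first allocation to take effect reliably; most options can also be set through `MIMALLOC_`-prefixed environment variables. A minimal sketch using a long-standing public option (the dev3-specific option names above may still change):

#include <mimalloc.h>

int main(void) {
  mi_option_set(mi_option_show_errors, 1);  // set before the first allocation
  void* p = mi_malloc(64);
  mi_free(p);
  return 0;
}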
@@ -359,9 +396,10 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) {
thread_entry_fun = fun;
DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD));
HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE));
thandles[0] = GetCurrentThread(); // avoid lint warning
const size_t start = (main_participates ? 1 : 0);
for (size_t i = start; i < nthreads; i++) {
thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]);
thandles[i] = CreateThread(0, 8*1024L, &thread_entry, (void*)(i), 0, &tids[i]);
}
if (main_participates) fun(0); // run the main thread as well
for (size_t i = start; i < nthreads; i++) {
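The `8*1024L` change simply makes the constant's type explicit for CreateThread's SIZE_T stack-size parameter. For context, a sketch of the create/join shape of this helper (the real code additionally handles `main_participates` and allocates with `custom_calloc`):

#include <windows.h>
#include <stdlib.h>

static DWORD WINAPI worker(LPVOID arg) {
  (void)arg;                          // thread index arrives as (void*)(i)
  return 0;
}

static void run_and_join(size_t n) {
  DWORD tid;
  HANDLE* h = (HANDLE*)calloc(n, sizeof(HANDLE));
  for (size_t i = 0; i < n; i++) {
    h[i] = CreateThread(NULL, 8*1024L /* SIZE_T stack size */, &worker, (LPVOID)i, 0, &tid);
  }
  for (size_t i = 0; i < n; i++) {    // join and release each worker
    WaitForSingleObject(h[i], INFINITE);
    CloseHandle(h[i]);
  }
  free(h);
}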