diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ce084f6..0d780fa1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,7 +126,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENE set(MI_ARCH "x64") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv[89].?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) set(MI_ARCH "arm64") -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567].?|ARM)$") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567]|ARM)$") set(MI_ARCH "arm32") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$") if(CMAKE_SIZEOF_VOID_P==4) @@ -173,8 +173,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "Intel") list(APPEND mi_cflags -Wall) endif() -# force C++ compilation with msvc or clang-cl to use modern C++ atomics -if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel" OR MI_CLANG_CL) +if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel") set(MI_USE_CXX "ON") endif() @@ -435,7 +434,7 @@ endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") if(MI_OPT_ARCH) - if(APPLE AND CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999) + if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999) if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a") endif() @@ -533,9 +532,7 @@ if(MI_TRACK_ASAN) endif() string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LC) list(APPEND mi_defines "MI_CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE_LC}") #todo: multi-config project needs $ ? -if(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel|none)$") - list(APPEND mi_defines MI_BUILD_RELEASE) -else() +if(NOT(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel|none)$")) set(mi_libname "${mi_libname}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. -debug) if not a release version endif() @@ -585,7 +582,7 @@ if(MI_BUILD_SHARED) install(TARGETS mimalloc EXPORT mimalloc ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir}) - if(WIN32 AND NOT MINGW) + if(WIN32) # On windows, the import library name for the dll would clash with the static mimalloc.lib library # so we postfix the dll import library with `.dll.lib` (and also the .pdb debug file) set_property(TARGET mimalloc PROPERTY ARCHIVE_OUTPUT_NAME "${mi_libname}.dll" ) @@ -595,9 +592,6 @@ if(MI_BUILD_SHARED) # install(FILES "$/${mi_libname}.dll.pdb" DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() if(WIN32 AND MI_WIN_REDIRECT) - if(MINGW) - set_property(TARGET mimalloc PROPERTY PREFIX "") - endif() # On windows, link and copy the mimalloc redirection dll too. 
if(CMAKE_GENERATOR_PLATFORM STREQUAL "arm64ec") set(MIMALLOC_REDIRECT_SUFFIX "-arm64ec") @@ -713,12 +707,10 @@ if (MI_BUILD_TESTS) target_compile_definitions(mimalloc-test-${TEST_NAME} PRIVATE ${mi_defines}) target_compile_options(mimalloc-test-${TEST_NAME} PRIVATE ${mi_cflags}) target_include_directories(mimalloc-test-${TEST_NAME} PRIVATE include) - if(MI_BUILD_STATIC AND NOT MI_DEBUG_TSAN) - target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc-static ${mi_libraries}) - elseif(MI_BUILD_SHARED) + if(MI_BUILD_SHARED AND (MI_TRACK_ASAN OR MI_DEBUG_TSAN OR MI_DEBUG_UBSAN)) target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc ${mi_libraries}) else() - message(STATUS "cannot build TSAN tests without MI_BUILD_SHARED being enabled") + target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc-static ${mi_libraries}) endif() add_test(NAME test-${TEST_NAME} COMMAND mimalloc-test-${TEST_NAME}) endforeach() @@ -727,19 +719,21 @@ if (MI_BUILD_TESTS) if(MI_BUILD_SHARED AND NOT (MI_TRACK_ASAN OR MI_DEBUG_TSAN OR MI_DEBUG_UBSAN)) add_executable(mimalloc-test-stress-dynamic test/test-stress.c) target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE ${mi_defines} "USE_STD_MALLOC=1") + if(WIN32) + target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE "MI_LINK_VERSION=1") + endif() target_compile_options(mimalloc-test-stress-dynamic PRIVATE ${mi_cflags}) target_include_directories(mimalloc-test-stress-dynamic PRIVATE include) + target_link_libraries(mimalloc-test-stress-dynamic PRIVATE mimalloc ${mi_libraries}) # mi_version if(WIN32) - target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE "MI_LINK_VERSION=1") # link mi_version - target_link_libraries(mimalloc-test-stress-dynamic PRIVATE mimalloc ${mi_libraries}) # link mi_version - add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_VERBOSE=1 $) + add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_SHOW_STATS=1 $) else() if(APPLE) set(LD_PRELOAD "DYLD_INSERT_LIBRARIES") else() set(LD_PRELOAD "LD_PRELOAD") endif() - add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_VERBOSE=1 ${LD_PRELOAD}=$ $) + add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_SHOW_STATS=1 ${LD_PRELOAD}=$ $) endif() endif() endif() diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 83d6a482..a803cd15 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -6,8 +6,10 @@ trigger: branches: include: - - main - - dev* + - master + - dev + - dev2 + - dev3 tags: include: - v* @@ -32,22 +34,6 @@ jobs: BuildType: secure cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON MSBuildConfiguration: Release - Debug x86: - BuildType: debug - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -A Win32 - MSBuildConfiguration: Debug - Release x86: - BuildType: release - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -A Win32 - MSBuildConfiguration: Release - Debug Fixed TLS: - BuildType: debug - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_WIN_USE_FIXED_TLS=ON - MSBuildConfiguration: Debug - Release Fixed TLS: - BuildType: release - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_WIN_USE_FIXED_TLS=ON - MSBuildConfiguration: Release steps: - task: CMake@1 inputs: @@ -175,7 +161,6 @@ jobs: - script: ctest --verbose --timeout 240 workingDirectory: $(BuildType) displayName: CTest - # - upload: $(Build.SourcesDirectory)/$(BuildType) # artifact: mimalloc-macos-$(BuildType) @@ -183,6 +168,35 @@ jobs: # Other OS 
versions (just debug mode) # ---------------------------------------------------------- +- job: + displayName: Windows 2019 + pool: + vmImage: + windows-2019 + strategy: + matrix: + Debug: + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + MSBuildConfiguration: Debug + Release: + BuildType: release + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + MSBuildConfiguration: Release + steps: + - task: CMake@1 + inputs: + workingDirectory: $(BuildType) + cmakeArgs: .. $(cmakeExtraArgs) + - task: MSBuild@1 + inputs: + solution: $(BuildType)/libmimalloc.sln + configuration: '$(MSBuildConfiguration)' + msbuildArguments: -m + - script: ctest --verbose --timeout 240 -C $(MSBuildConfiguration) + workingDirectory: $(BuildType) + displayName: CTest + - job: displayName: Ubuntu 24.04 pool: diff --git a/bin/mimalloc-redirect-arm64.dll b/bin/mimalloc-redirect-arm64.dll old mode 100755 new mode 100644 index 27172d2c..e6360285 Binary files a/bin/mimalloc-redirect-arm64.dll and b/bin/mimalloc-redirect-arm64.dll differ diff --git a/bin/mimalloc-redirect-arm64.lib b/bin/mimalloc-redirect-arm64.lib old mode 100755 new mode 100644 index dca80b9b..11d71ef9 Binary files a/bin/mimalloc-redirect-arm64.lib and b/bin/mimalloc-redirect-arm64.lib differ diff --git a/bin/mimalloc-redirect-arm64ec.dll b/bin/mimalloc-redirect-arm64ec.dll old mode 100755 new mode 100644 index a228af39..f5ee4e47 Binary files a/bin/mimalloc-redirect-arm64ec.dll and b/bin/mimalloc-redirect-arm64ec.dll differ diff --git a/bin/mimalloc-redirect-arm64ec.lib b/bin/mimalloc-redirect-arm64ec.lib old mode 100755 new mode 100644 index 0ce77436..b88e8fc1 Binary files a/bin/mimalloc-redirect-arm64ec.lib and b/bin/mimalloc-redirect-arm64ec.lib differ diff --git a/bin/mimalloc-redirect.dll b/bin/mimalloc-redirect.dll old mode 100755 new mode 100644 index ec035f1b..9e015cc6 Binary files a/bin/mimalloc-redirect.dll and b/bin/mimalloc-redirect.dll differ diff --git a/bin/mimalloc-redirect.lib b/bin/mimalloc-redirect.lib old mode 100755 new mode 100644 index 785fa475..1d710c01 Binary files a/bin/mimalloc-redirect.lib and b/bin/mimalloc-redirect.lib differ diff --git a/bin/mimalloc-redirect32.dll b/bin/mimalloc-redirect32.dll old mode 100755 new mode 100644 index 92578f24..32799ffe Binary files a/bin/mimalloc-redirect32.dll and b/bin/mimalloc-redirect32.dll differ diff --git a/bin/mimalloc-redirect32.lib b/bin/mimalloc-redirect32.lib old mode 100755 new mode 100644 index bf649787..e2927250 Binary files a/bin/mimalloc-redirect32.lib and b/bin/mimalloc-redirect32.lib differ diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index dfe78468..7f3bd631 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ -set(mi_version_major 2) -set(mi_version_minor 2) -set(mi_version_patch 5) +set(mi_version_major 1) +set(mi_version_minor 9) +set(mi_version_patch 2) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/contrib/docker/alpine-arm32v7/Dockerfile b/contrib/docker/alpine-arm32v7/Dockerfile index daa60f50..f74934fb 100644 --- a/contrib/docker/alpine-arm32v7/Dockerfile +++ b/contrib/docker/alpine-arm32v7/Dockerfile @@ -1,6 +1,6 @@ # install from an image # download first an appropriate tar.gz image into the current directory -# from +# from: FROM scratch # Substitute the image name that was downloaded diff --git a/contrib/docker/alpine-x86/Dockerfile b/contrib/docker/alpine-x86/Dockerfile deleted file mode 
100644 index a0f76c17..00000000 --- a/contrib/docker/alpine-x86/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -# install from an image -# download first an appropriate tar.gz image into the current directory -# from -FROM scratch - -# Substitute the image name that was downloaded -ADD alpine-minirootfs-20250108-x86.tar.gz / - -# Install tools -RUN apk add build-base make cmake -RUN apk add git -RUN apk add vim - -RUN mkdir -p /home/dev -WORKDIR /home/dev - -# Get mimalloc -RUN git clone https://github.com/microsoft/mimalloc -b dev2 -RUN mkdir -p mimalloc/out/release -RUN mkdir -p mimalloc/out/debug - -# Build mimalloc debug -WORKDIR /home/dev/mimalloc/out/debug -RUN cmake ../.. -DMI_DEBUG_FULL=ON -# RUN make -j -# RUN make test - -CMD ["/bin/sh"] diff --git a/contrib/vcpkg/portfile.cmake b/contrib/vcpkg/portfile.cmake index abb90af9..69661526 100644 --- a/contrib/vcpkg/portfile.cmake +++ b/contrib/vcpkg/portfile.cmake @@ -4,12 +4,12 @@ vcpkg_from_github( HEAD_REF master # The "REF" can be a commit hash, branch name (dev2), or a version (v2.2.1). - REF "v${VERSION}" - # REF e2db21e9ba9fb9172b7b0aa0fe9b8742525e8774 + # REF "v${VERSION}" + REF 866ce5b89db1dbc3e66bbf89041291fd16329518 # The sha512 is the hash of the tar.gz bundle. - # (To get the sha512, run `vcpkg install "mimalloc[override]" --overlay-ports=./contrib/vcpkg` and copy the sha from the error message.) - SHA512 5218fcd3ad285687ed3f78b4651d7d3aee92b6f28e6c563a884975e654a43c94c4e5c02c5ed0322c3d3627d83d4843df2d2d8441f09aa18d00674ca9fd657345 + # (To get the sha512, run `vcpkg install mimalloc[override] --overlay-ports=` and copy the sha from the error message.) + SHA512 0b0e5ff823c49b9534b8c32800679806c5d7c29020af058da043c3e6e36ae3c32a1cdd5a21ece97dd60bc7dd4703967f683beac435dbb8514638a6cc55e5dea8 ) vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS @@ -19,7 +19,6 @@ vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS secure MI_SECURE override MI_OVERRIDE optarch MI_OPT_ARCH - nooptarch MI_NO_OPT_ARCH optsimd MI_OPT_SIMD xmalloc MI_XMALLOC asm MI_SEE_ASM diff --git a/contrib/vcpkg/vcpkg.json b/contrib/vcpkg/vcpkg.json index 42f2aa35..45f8097b 100644 --- a/contrib/vcpkg/vcpkg.json +++ b/contrib/vcpkg/vcpkg.json @@ -1,7 +1,7 @@ { "name": "mimalloc", - "version": "2.2.4", - "port-version": 1, + "version": "1.9.2", + "port-version": 2, "description": "Compact general purpose allocator with excellent performance", "homepage": "https://github.com/microsoft/mimalloc", "license": "MIT", @@ -35,9 +35,6 @@ "optarch": { "description": "Use architecture specific optimizations (on x64: '-march=haswell;-mavx2', on arm64: '-march=armv8.1-a')" }, - "nooptarch": { - "description": "Do _not_ use architecture specific optimizations (on x64: '-march=haswell;-mavx2', on arm64: '-march=armv8.1-a')" - }, "optsimd": { "description": "Allow use of SIMD instructions (avx2 or neon) (requires 'optarch' to be enabled)" }, diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index 128a4ff6..d6af71ce 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -282,8 +282,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea7} + + {abb5eae7-b3e6-432e-b636-333449892ea6} diff --git a/include/mimalloc.h b/include/mimalloc.h index d895d925..4e9c3156 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2025, Microsoft Research, Daan Leijen +Copyright (c) 2018-2023, 
Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 225 // major + 2 digits minor +#define MI_MALLOC_VERSION 192 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes @@ -97,7 +97,6 @@ terms of the MIT license. A copy of the license can be found in the file #include // size_t #include // bool -#include // INTPTR_MAX #ifdef __cplusplus extern "C" { @@ -154,21 +153,17 @@ mi_decl_export void mi_stats_reset(void) mi_attr_noexcept; mi_decl_export void mi_stats_merge(void) mi_attr_noexcept; mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept; // backward compatibility: `out` is ignored and should be NULL mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; -mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; mi_decl_export void mi_options_print(void) mi_attr_noexcept; +mi_decl_export void mi_process_init(void) mi_attr_noexcept; +mi_decl_export void mi_thread_init(void) mi_attr_noexcept; +mi_decl_export void mi_thread_done(void) mi_attr_noexcept; +mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; + mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept; - -// Generally do not use the following as these are usually called automatically -mi_decl_export void mi_process_init(void) mi_attr_noexcept; -mi_decl_export void mi_cdecl mi_process_done(void) mi_attr_noexcept; -mi_decl_export void mi_thread_init(void) mi_attr_noexcept; -mi_decl_export void mi_thread_done(void) mi_attr_noexcept; - - // ------------------------------------------------------------------------------------- // Aligned allocation // Note that `alignment` always follows `size` for consistency with unaligned diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index e8bac316..6eaa6f99 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -111,7 +111,6 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub); #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,(tp*)des) #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des) #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,(tp*)des) -#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des) mi_atomic_cas_strong_acq_rel(p,exp,(tp*)des) #define mi_atomic_exchange_ptr_relaxed(tp,p,x) mi_atomic_exchange_relaxed(p,(tp*)x) #define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,(tp*)x) #define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,(tp*)x) @@ -121,7 +120,6 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub); #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,des) #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,des) #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,des) 
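The `mi_atomic_cas_ptr_*` wrappers above exist so the same call site compiles both as C11 `_Atomic` code and as C++ `std::atomic` code (the file provides parallel definitions for each mode). Their typical consumer is a compare-and-swap publish loop; below is a minimal sketch in plain C11 of the pattern the weak/release variants are built for — the `node_t` type and `push` function are illustrative, not part of mimalloc:

```c
#include <stdatomic.h>

typedef struct node_s { struct node_s* next; } node_t;

// lock-free push: the kind of loop mi_atomic_cas_ptr_weak_release wraps.
// A weak CAS may fail spuriously, so it always sits inside a retry loop.
static void push(_Atomic(node_t*)* top, node_t* n) {
  node_t* expect = atomic_load_explicit(top, memory_order_relaxed);
  do {
    n->next = expect;   // link to the current head before publishing
  } while (!atomic_compare_exchange_weak_explicit(
               top, &expect, n,
               memory_order_release,    // on success: publish n->next to other threads
               memory_order_relaxed));  // on failure: expect was reloaded, just retry
}
```

The release ordering on success is what makes `n->next` visible to a thread that later takes `n` off the list.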
-#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des) mi_atomic_cas_strong_acq_rel(p,exp,des) #define mi_atomic_exchange_ptr_relaxed(tp,p,x) mi_atomic_exchange_relaxed(p,x) #define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,x) #define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,x) @@ -268,13 +266,6 @@ static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int6 return current; #endif } -static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) { - const int64_t add = *padd; - if (add != 0) { - mi_atomic_addi64_relaxed((volatile _Atomic(int64_t)*)p, add); - } -} - static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) { int64_t current; do { @@ -305,7 +296,6 @@ static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p, #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) -#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des) mi_atomic_cas_strong_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) #define mi_atomic_exchange_ptr_relaxed(tp,p,x) (tp*)mi_atomic_exchange_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)x) #define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x) #define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x) @@ -373,9 +363,8 @@ static inline void mi_atomic_yield(void) { _mm_pause(); } #elif (defined(__GNUC__) || defined(__clang__)) && \ - (defined(__x86_64__) || defined(__i386__) || \ - defined(__aarch64__) || defined(__arm__) || \ - defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__)) + (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \ + defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) || defined(__POWERPC__) #if defined(__x86_64__) || defined(__i386__) static inline void mi_atomic_yield(void) { __asm__ volatile ("pause" ::: "memory"); @@ -384,16 +373,10 @@ static inline void mi_atomic_yield(void) { static inline void mi_atomic_yield(void) { __asm__ volatile("wfe"); } -#elif defined(__arm__) -#if __ARM_ARCH >= 7 +#elif (defined(__arm__) && __ARM_ARCH__ >= 7) static inline void mi_atomic_yield(void) { __asm__ volatile("yield" ::: "memory"); } -#else -static inline void mi_atomic_yield(void) { - __asm__ volatile ("nop" ::: "memory"); -} -#endif #elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__) #ifdef __APPLE__ static inline void mi_atomic_yield(void) { @@ -404,6 +387,10 @@ static inline void mi_atomic_yield(void) { __asm__ __volatile__ ("or 27,27,27" ::: "memory"); } #endif +#elif defined(__armel__) || defined(__ARMEL__) +static inline void mi_atomic_yield(void) { + __asm__ volatile ("nop" ::: "memory"); +} #endif #elif defined(__sun) // Fallback for other archs diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index ca5be930..106da0d1 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -8,6 +8,7 @@ terms of the 
MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_INTERNAL_H #define MIMALLOC_INTERNAL_H + // -------------------------------------------------------------------------- // This file contains the internal API's of mimalloc and various utility // functions and macros. @@ -16,88 +17,50 @@ terms of the MIT license. A copy of the license can be found in the file #include "types.h" #include "track.h" - -// -------------------------------------------------------------------------- -// Compiler defines -// -------------------------------------------------------------------------- - #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) #else #define mi_trace_message(...) #endif -#define mi_decl_cache_align mi_decl_align(64) - +#define MI_CACHE_LINE 64 #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) -#define mi_decl_align(a) __declspec(align(a)) -#define mi_decl_noreturn __declspec(noreturn) +#define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) #define mi_decl_weak #define mi_decl_hidden -#define mi_decl_cold #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread -#define mi_decl_align(a) __attribute__((aligned(a))) -#define mi_decl_noreturn __attribute__((noreturn)) +#define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) #define mi_decl_weak __attribute__((weak)) #define mi_decl_hidden __attribute__((visibility("hidden"))) -#if (__GNUC__ >= 4) || defined(__clang__) -#define mi_decl_cold __attribute__((cold)) -#else -#define mi_decl_cold -#endif #elif __cplusplus >= 201103L // c++11 #define mi_decl_noinline #define mi_decl_thread thread_local -#define mi_decl_align(a) alignas(a) -#define mi_decl_noreturn [[noreturn]] +#define mi_decl_cache_align alignas(MI_CACHE_LINE) #define mi_decl_weak #define mi_decl_hidden -#define mi_decl_cold #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) -#define mi_decl_align(a) -#define mi_decl_noreturn +#define mi_decl_cache_align #define mi_decl_weak #define mi_decl_hidden -#define mi_decl_cold -#endif - -#if defined(__GNUC__) || defined(__clang__) -#define mi_unlikely(x) (__builtin_expect(!!(x),false)) -#define mi_likely(x) (__builtin_expect(!!(x),true)) -#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) -#define mi_unlikely(x) (x) [[unlikely]] -#define mi_likely(x) (x) [[likely]] -#else -#define mi_unlikely(x) (x) -#define mi_likely(x) (x) -#endif - -#ifndef __has_builtin -#define __has_builtin(x) 0 -#endif - -#if defined(__cplusplus) -#define mi_decl_externc extern "C" -#else -#define mi_decl_externc #endif #if defined(__EMSCRIPTEN__) && !defined(__wasi__) #define __wasi__ #endif - -// -------------------------------------------------------------------------- -// Internal functions -// -------------------------------------------------------------------------- +#if defined(__cplusplus) +#define mi_decl_externc extern "C" +#else +#define mi_decl_externc +#endif // "libc.c" #include @@ -133,10 +96,10 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c -extern mi_decl_hidden mi_decl_cache_align mi_stats_t 
_mi_stats_main; +extern mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_hidden mi_decl_cache_align const mi_page_t _mi_page_empty; -void _mi_auto_process_init(void); -void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept; +void _mi_process_load(void); +void mi_cdecl _mi_process_done(void); bool _mi_is_redirected(void); bool _mi_allocator_init(const char** message); void _mi_allocator_done(void); @@ -154,7 +117,6 @@ void _mi_heap_guarded_init(mi_heap_t* heap); // os.c void _mi_os_init(void); // called from process init void* _mi_os_alloc(size_t size, mi_memid_t* memid); -void* _mi_os_zalloc(size_t size, mi_memid_t* memid); void _mi_os_free(void* p, size_t size, mi_memid_t memid); void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); @@ -164,14 +126,12 @@ bool _mi_os_has_overcommit(void); bool _mi_os_has_virtual_reserve(void); bool _mi_os_reset(void* addr, size_t size); +bool _mi_os_commit(void* p, size_t size, bool* is_zero); bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_protect(void* addr, size_t size); bool _mi_os_unprotect(void* addr, size_t size); bool _mi_os_purge(void* p, size_t size); bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size); -void _mi_os_reuse(void* p, size_t size); -mi_decl_nodiscard bool _mi_os_commit(void* p, size_t size, bool* is_zero); -mi_decl_nodiscard bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size); -bool _mi_os_protect(void* addr, size_t size); void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); @@ -179,10 +139,8 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); bool _mi_os_use_large_page(size_t size, size_t alignment); size_t _mi_os_large_page_size(void); -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); -int _mi_os_numa_node_count(void); -int _mi_os_numa_node(void); +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); // arena.c mi_arena_id_t _mi_arena_id_none(void); @@ -219,11 +177,10 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment); void _mi_segment_map_unsafe_destroy(void); // "segment.c" -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld); -void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); -void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); -bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld); -void _mi_segment_collect(mi_segment_t* segment, bool force); +mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld); +void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); +void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); #if MI_HUGE_PAGE_ABANDON void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); @@ -231,11 +188,10 @@ void 
_mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, m void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); #endif -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page -void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); -void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld); -bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); -bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); +void _mi_segments_collect(bool force, mi_segments_tld_t* tld); +void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); +bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); +bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -258,7 +214,6 @@ void _mi_deferred_free(mi_heap_t* heap, bool force); void _mi_page_free_collect(mi_page_t* page,bool force); void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments -size_t _mi_page_bin(const mi_page_t* page); // for stats size_t _mi_bin_size(size_t bin); // for stats size_t _mi_bin(size_t size); // for stats @@ -275,7 +230,6 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* pa // "stats.c" void _mi_stats_done(mi_stats_t* stats); -void _mi_stats_merge_thread(mi_tld_t* tld); mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); @@ -297,6 +251,26 @@ bool _mi_page_is_valid(mi_page_t* page); #endif +// ------------------------------------------------------ +// Branches +// ------------------------------------------------------ + +#if defined(__GNUC__) || defined(__clang__) +#define mi_unlikely(x) (__builtin_expect(!!(x),false)) +#define mi_likely(x) (__builtin_expect(!!(x),true)) +#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) +#define mi_unlikely(x) (x) [[unlikely]] +#define mi_likely(x) (x) [[likely]] +#else +#define mi_unlikely(x) (x) +#define mi_likely(x) (x) +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + + /* ----------------------------------------------------------- Error codes passed to `_mi_fatal_error` All are recoverable but EFAULT is a serious error and aborts by default in secure mode. @@ -321,32 +295,6 @@ bool _mi_page_is_valid(mi_page_t* page); #endif -// ------------------------------------------------------ -// Assertions -// ------------------------------------------------------ - -#if (MI_DEBUG) -// use our own assertion to print without memory allocation -mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func) mi_attr_noexcept; -#define mi_assert(expr) ((expr) ? 
(void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) -#else -#define mi_assert(x) -#endif - -#if (MI_DEBUG>1) -#define mi_assert_internal mi_assert -#else -#define mi_assert_internal(x) -#endif - -#if (MI_DEBUG>2) -#define mi_assert_expensive mi_assert -#else -#define mi_assert_expensive(x) -#endif - - - /* ----------------------------------------------------------- Inlined definitions ----------------------------------------------------------- */ @@ -393,28 +341,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { } } -// Align downwards -static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { - mi_assert_internal(alignment != 0); - uintptr_t mask = alignment - 1; - if ((alignment & mask) == 0) { // power of two? - return (sz & ~mask); - } - else { - return ((sz / alignment) * alignment); - } -} // Align a pointer upwards static inline void* mi_align_up_ptr(void* p, size_t alignment) { return (void*)_mi_align_up((uintptr_t)p, alignment); } -// Align a pointer downwards -static inline void* mi_align_down_ptr(void* p, size_t alignment) { - return (void*)_mi_align_down((uintptr_t)p, alignment); -} - // Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { @@ -438,7 +370,6 @@ static inline bool mi_mem_is_zero(const void* p, size_t size) { return true; } - // Align a byte size to a size in _machine words_, // i.e. byte size == `wsize*sizeof(void*)`. static inline size_t _mi_wsize_from_size(size_t size) { @@ -533,44 +464,29 @@ static inline mi_segment_t* _mi_ptr_segment(const void* p) { #endif } -static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) { - mi_assert_internal(s->slice_offset== 0 && s->slice_count > 0); - return (mi_page_t*)(s); -} - -static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) { - mi_assert_internal(p->slice_offset== 0 && p->slice_count > 0); - return (mi_slice_t*)(p); -} - // Segment belonging to a page static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { mi_assert_internal(page!=NULL); mi_segment_t* segment = _mi_ptr_segment(page); - mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries)); + mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]); return segment; } -static inline mi_slice_t* mi_slice_first(const mi_slice_t* slice) { - mi_slice_t* start = (mi_slice_t*)((uint8_t*)slice - slice->slice_offset); - mi_assert_internal(start >= _mi_ptr_segment(slice)->slices); - mi_assert_internal(start->slice_offset == 0); - mi_assert_internal(start + start->slice_count > slice); - return start; +// used internally +static inline size_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) { + // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages + ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment; + mi_assert_internal(diff >= 0 && (size_t)diff <= MI_SEGMENT_SIZE /* for huge alignment it can be equal */); + size_t idx = (size_t)diff >> segment->page_shift; + mi_assert_internal(idx < segment->capacity); + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0); + return idx; } -// Get the page containing the pointer (performance critical as it is called in mi_free) +// Get the page containing the pointer static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { - mi_assert_internal(p > (void*)segment); - ptrdiff_t diff = (uint8_t*)p - 
(uint8_t*)segment; - mi_assert_internal(diff > 0 && diff <= (ptrdiff_t)MI_SEGMENT_SIZE); - size_t idx = (size_t)diff >> MI_SEGMENT_SLICE_SHIFT; - mi_assert_internal(idx <= segment->slice_entries); - mi_slice_t* slice0 = (mi_slice_t*)&segment->slices[idx]; - mi_slice_t* slice = mi_slice_first(slice0); // adjust to the block that holds the page data - mi_assert_internal(slice->slice_offset == 0); - mi_assert_internal(slice >= segment->slices && slice < segment->slices + segment->slice_entries); - return mi_slice_to_page(slice); + size_t idx = _mi_segment_page_idx_of(segment, p); + return &((mi_segment_t*)segment)->pages[idx]; } // Quick page start for initialized pages @@ -593,8 +509,8 @@ static inline size_t mi_page_block_size(const mi_page_t* page) { } static inline bool mi_page_is_huge(const mi_page_t* page) { - mi_assert_internal((page->is_huge && _mi_page_segment(page)->kind == MI_SEGMENT_HUGE) || - (!page->is_huge && _mi_page_segment(page)->kind != MI_SEGMENT_HUGE)); + mi_assert_internal((page->is_huge && _mi_page_segment(page)->page_kind == MI_PAGE_HUGE) || + (!page->is_huge && _mi_page_segment(page)->page_kind != MI_PAGE_HUGE)); return page->is_huge; } @@ -606,11 +522,7 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) { // size of a segment static inline size_t mi_segment_size(mi_segment_t* segment) { - return segment->segment_slices * MI_SEGMENT_SLICE_SIZE; -} - -static inline uint8_t* mi_segment_end(mi_segment_t* segment) { - return (uint8_t*)segment + mi_segment_size(segment); + return segment->segment_size; } // Thread free access @@ -765,13 +677,12 @@ static inline bool mi_is_in_same_segment(const void* p, const void* q) { } static inline bool mi_is_in_same_page(const void* p, const void* q) { - mi_segment_t* segment = _mi_ptr_segment(p); - if (_mi_ptr_segment(q) != segment) return false; - // assume q may be invalid // return (_mi_segment_page_of(segment, p) == _mi_segment_page_of(segment, q)); - mi_page_t* page = _mi_segment_page_of(segment, p); - size_t psize; - uint8_t* start = _mi_segment_page_start(segment, page, &psize); - return (start <= (uint8_t*)q && (uint8_t*)q < start + psize); + mi_segment_t* segmentp = _mi_ptr_segment(p); + mi_segment_t* segmentq = _mi_ptr_segment(q); + if (segmentp != segmentq) return false; + size_t idxp = _mi_segment_page_idx_of(segmentp, p); + size_t idxq = _mi_segment_page_idx_of(segmentq, q); + return (idxp == idxq); } static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) { @@ -853,50 +764,6 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c } -// ------------------------------------------------------------------- -// commit mask -// ------------------------------------------------------------------- - -static inline void mi_commit_mask_create_empty(mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - cm->mask[i] = 0; - } -} - -static inline void mi_commit_mask_create_full(mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - cm->mask[i] = ~((size_t)0); - } -} - -static inline bool mi_commit_mask_is_empty(const mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - if (cm->mask[i] != 0) return false; - } - return true; -} - -static inline bool mi_commit_mask_is_full(const mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - if (cm->mask[i] != ~((size_t)0)) return false; - } - return true; -} - -// defined in `segment.c`: -size_t 
_mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total); -size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx); - -#define mi_commit_mask_foreach(cm,idx,count) \ - idx = 0; \ - while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) { - -#define mi_commit_mask_foreach_end() \ - idx += count; \ - } - - - /* ----------------------------------------------------------- memory id's ----------------------------------------------------------- */ @@ -912,10 +779,8 @@ static inline mi_memid_t _mi_memid_none(void) { return _mi_memid_create(MI_MEM_NONE); } -static inline mi_memid_t _mi_memid_create_os(void* base, size_t size, bool committed, bool is_zero, bool is_large) { +static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) { mi_memid_t memid = _mi_memid_create(MI_MEM_OS); - memid.mem.os.base = base; - memid.mem.os.size = size; memid.initially_committed = committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; @@ -947,6 +812,24 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) { return x; } +// ------------------------------------------------------------------- +// Optimize numa node access for the common case (= one node) +// ------------------------------------------------------------------- + +int _mi_os_numa_node_get(void); +size_t _mi_os_numa_node_count_get(void); + +extern mi_decl_hidden _Atomic(size_t) _mi_numa_node_count; +static inline int _mi_os_numa_node(void) { + if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } + else return _mi_os_numa_node_get(); +} +static inline size_t _mi_os_numa_node_count(void) { + const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count); + if mi_likely(count > 0) { return count; } + else return _mi_os_numa_node_count_get(); +} + // ----------------------------------------------------------------------- @@ -987,7 +870,7 @@ static inline size_t mi_clz(size_t x) { #else _BitScanReverse64(&idx, x); #endif - return ((MI_SIZE_BITS - 1) - (size_t)idx); + return ((MI_SIZE_BITS - 1) - idx); } static inline size_t mi_ctz(size_t x) { if (x==0) return MI_SIZE_BITS; @@ -997,7 +880,7 @@ static inline size_t mi_ctz(size_t x) { #else _BitScanForward64(&idx, x); #endif - return (size_t)idx; + return idx; } #else @@ -1064,21 +947,6 @@ static inline size_t mi_bsr(size_t x) { return (x==0 ? MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x)); } -size_t _mi_popcount_generic(size_t x); - -static inline size_t mi_popcount(size_t x) { - if (x<=1) return x; - if (x==SIZE_MAX) return MI_SIZE_BITS; - #if defined(__GNUC__) - #if (SIZE_MAX == ULONG_MAX) - return __builtin_popcountl(x); - #else - return __builtin_popcountll(x); - #endif - #else - return _mi_popcount_generic(x); - #endif -} // --------------------------------------------------------------------------------- // Provide our own `_mi_memcpy` for potential performance optimizations. 
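A few hunks up, the restored `_mi_os_numa_node`/`_mi_os_numa_node_count` inlines cache the node count in a process-global atomic so the common single-node case costs one relaxed load instead of an OS query. The same read-mostly caching pattern in isolation — `query_numa_node_count` stands in for the platform-specific call and is not a real mimalloc function:

```c
#include <stdatomic.h>
#include <stddef.h>

// hypothetical platform query, e.g. scanning /sys/devices/system/node on Linux
extern size_t query_numa_node_count(void);

static _Atomic(size_t) numa_node_count;   // 0 = not yet initialized

static size_t os_numa_node_count(void) {
  size_t count = atomic_load_explicit(&numa_node_count, memory_order_relaxed);
  if (count == 0) {                        // first caller performs the (slow) OS query
    count = query_numa_node_count();
    if (count == 0) count = 1;             // never cache 0: it means "uninitialized"
    atomic_store_explicit(&numa_node_count, count, memory_order_relaxed);
  }
  return count;                            // later callers take the single relaxed load
}
```

Racing initializers may both perform the query, but they store the same value, so relaxed ordering suffices and no lock is needed.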
@@ -1090,8 +958,8 @@ static inline size_t mi_popcount(size_t x) { #if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #include -extern mi_decl_hidden bool _mi_cpu_has_fsrm; -extern mi_decl_hidden bool _mi_cpu_has_erms; +extern bool _mi_cpu_has_fsrm; +extern bool _mi_cpu_has_erms; static inline void _mi_memcpy(void* dst, const void* src, size_t n) { if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __movsb((unsigned char*)dst, (const unsigned char*)src, n); diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 1087d9b8..bddd66e9 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -59,15 +59,10 @@ int _mi_prim_commit(void* addr, size_t size, bool* is_zero); // pre: needs_recommit != NULL int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit); -// Reset memory. The range keeps being accessible but the content might be reset to zero at any moment. +// Reset memory. The range keeps being accessible but the content might be reset. // Returns error code or 0 on success. int _mi_prim_reset(void* addr, size_t size); -// Reuse memory. This is called for memory that is already committed but -// may have been reset (`_mi_prim_reset`) or decommitted (`_mi_prim_decommit`) where `needs_recommit` was false. -// Returns error code or 0 on success. On most platforms this is a no-op. -int _mi_prim_reuse(void* addr, size_t size); - // Protect memory. Returns error code or 0 on success. int _mi_prim_protect(void* addr, size_t size, bool protect); @@ -123,6 +118,9 @@ void _mi_prim_thread_done_auto_done(void); void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); + + + //------------------------------------------------------------------- // Access to TLS (thread local storage) slots. // We need fast access to both a unique thread id (in `free.c:mi_free`) and @@ -210,19 +208,19 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #elif _WIN32 && MI_WIN_USE_FIXED_TLS && !defined(MI_WIN_USE_FLS) // On windows we can store the thread-local heap at a fixed TLS slot to avoid -// thread-local initialization checks in the fast path. -// We allocate a user TLS slot at process initialization (see `windows/prim.c`) -// and store the offset `_mi_win_tls_offset`. -#define MI_HAS_TLS_SLOT 1 // 2 = we can reliably initialize the slot (saving a test on each malloc) +// thread-local initialization checks in the fast path. This uses a fixed location +// in the TCB though (last user-reserved slot by default) which may clash with other applications. 
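With `MI_WIN_USE_FIXED_TLS`, the hunk below pins the heap pointer at a fixed byte offset in the TEB (`0x710` on 32-bit, `0x888` on 64-bit — the last user-reserved slots) instead of an offset allocated at process start. On MSVC such a slot is reached directly through the segment register; a sketch of that access pattern, assuming x64 (32-bit would use `__readfsdword`/`__writefsdword`):

```c
#include <intrin.h>
#include <stddef.h>

// read/write a TEB slot at a fixed byte offset from the gs base (x64 MSVC).
// 0x888 is the last user-reserved slot, as chosen in the hunk below --
// nothing reserves it, so other code in the process could claim it too.
static inline void* teb_slot_get(size_t ofs) {
  return (void*)__readgsqword((unsigned long)ofs);
}
static inline void teb_slot_set(size_t ofs, void* value) {
  __writegsqword((unsigned long)ofs, (unsigned __int64)value);
}
```

The trade-off named in the comment above is real: if another component picks the same offset, the cached heap pointer is silently corrupted, which is why the dynamically allocated offset was the later default.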
-extern mi_decl_hidden size_t _mi_win_tls_offset; +#define MI_HAS_TLS_SLOT 2 // 2 = we can reliably initialize the slot (saving a test on each malloc) #if MI_WIN_USE_FIXED_TLS > 1 #define MI_TLS_SLOT (MI_WIN_USE_FIXED_TLS) #elif MI_SIZE_SIZE == 4 -#define MI_TLS_SLOT (0x0E10 + _mi_win_tls_offset) // User TLS slots +#define MI_TLS_SLOT (0x710) // Last user-reserved slot +// #define MI_TLS_SLOT (0xF0C) // Last TlsSlot (might clash with other app reserved slot) #else -#define MI_TLS_SLOT (0x1480 + _mi_win_tls_offset) // User TLS slots +#define MI_TLS_SLOT (0x888) // Last user-reserved slot +// #define MI_TLS_SLOT (0x1678) // Last TlsSlot (might clash with other app reserved slot) #endif static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { @@ -271,8 +269,8 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce // defined in `init.c`; do not use these directly -extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern bool _mi_process_is_initialized; // has mi_process_init been called? static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; @@ -400,7 +398,7 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { #elif defined(MI_TLS_PTHREAD) -extern mi_decl_hidden pthread_key_t _mi_heap_default_key; +extern pthread_key_t _mi_heap_default_key; static inline mi_heap_t* mi_prim_get_default_heap(void) { mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index a15d9cba..9f743149 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -13,9 +13,8 @@ terms of the MIT license. A copy of the license can be found in the file // mi_heap_t : all data for a thread-local heap, contains // lists of all managed heap pages. // mi_segment_t : a larger chunk of memory (32GiB) from where pages -// are allocated. A segment is divided in slices (64KiB) from -// which pages are allocated. -// mi_page_t : a "mimalloc" page (usually 64KiB or 512KiB) from +// are allocated. +// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from // where objects are allocated. // Note: we write "OS page" for OS memory pages while // using plain "page" for mimalloc pages (`mi_page_t`). @@ -67,10 +66,10 @@ terms of the MIT license. 
A copy of the license can be found in the file // #define MI_DEBUG 2 // + internal assertion checks // #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON) #if !defined(MI_DEBUG) -#if defined(MI_BUILD_RELEASE) || defined(NDEBUG) -#define MI_DEBUG 0 -#else +#if !defined(NDEBUG) || defined(_DEBUG) #define MI_DEBUG 2 +#else +#define MI_DEBUG 0 #endif #endif @@ -168,40 +167,38 @@ typedef int32_t mi_ssize_t; // ------------------------------------------------------ // Main tuning parameters for segment and page sizes -// Sizes for 64-bit (usually divide by two for 32-bit) -#ifndef MI_SEGMENT_SLICE_SHIFT -#define MI_SEGMENT_SLICE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB (32KiB on 32-bit) -#endif - -#ifndef MI_SEGMENT_SHIFT -#if MI_INTPTR_SIZE > 4 -#define MI_SEGMENT_SHIFT ( 9 + MI_SEGMENT_SLICE_SHIFT) // 32MiB -#else -#define MI_SEGMENT_SHIFT ( 7 + MI_SEGMENT_SLICE_SHIFT) // 4MiB on 32-bit -#endif -#endif - +// Sizes for 64-bit, divide by two for 32-bit #ifndef MI_SMALL_PAGE_SHIFT -#define MI_SMALL_PAGE_SHIFT (MI_SEGMENT_SLICE_SHIFT) // 64KiB +#define MI_SMALL_PAGE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB #endif #ifndef MI_MEDIUM_PAGE_SHIFT -#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB +#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB +#endif +#ifndef MI_LARGE_PAGE_SHIFT +#define MI_LARGE_PAGE_SHIFT ( 3 + MI_MEDIUM_PAGE_SHIFT) // 4MiB +#endif +#ifndef MI_SEGMENT_SHIFT +#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB -- must be equal to `MI_LARGE_PAGE_SHIFT` #endif // Derived constants #define MI_SEGMENT_SIZE (MI_ZU(1)<= 655360) +#if (MI_LARGE_OBJ_WSIZE_MAX >= 655360) #error "mimalloc internal: define more bins" #endif // Maximum block size for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`) -#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX) +#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX) // Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments -#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) +#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) -// Maximum slice count (255) for which we can find the page for interior pointers -#define MI_MAX_SLICE_OFFSET_COUNT ((MI_BLOCK_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1) - -// we never allocate more than PTRDIFF_MAX (see also ) -// on 64-bit+ systems we also limit the maximum allocation size such that the slice count fits in 32-bits. 
(issue #877) -#if (PTRDIFF_MAX > INT32_MAX) && (PTRDIFF_MAX >= (MI_SEGMENT_SLIZE_SIZE * UINT32_MAX)) -#define MI_MAX_ALLOC_SIZE (MI_SEGMENT_SLICE_SIZE * (UINT32_MAX-1)) -#else +// We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX -#endif - // ------------------------------------------------------ // Mimalloc pages contain allocated blocks @@ -308,8 +296,8 @@ typedef uintptr_t mi_thread_free_t; // Notes: // - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - The size is 12 words on 64-bit which helps the page index calculations -// (and 14 words on 32-bit, and encoded free lists add 2 words) +// - The size is 10 words on 64-bit which helps the page index calculations +// (and 12 words on 32-bit, and encoded free lists add 2 words) // - `xthread_free` uses the bottom bits as a delayed-free flags to optimize // concurrent frees where only the first concurrent free adds to the owning // heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). @@ -319,12 +307,12 @@ typedef uintptr_t mi_thread_free_t; // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { // "owned" by the segment - uint32_t slice_count; // slices in this page (0 if not a page) - uint32_t slice_offset; // distance from the actual page data slice (0 if a page) + uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]` + uint8_t segment_in_use:1; // `true` if the segment allocated this page uint8_t is_committed:1; // `true` if the page virtual memory is committed uint8_t is_zero_init:1; // `true` if the page was initially zero initialized - uint8_t is_huge:1; // `true` if the page is in a huge segment (`segment->kind == MI_SEGMENT_HUGE`) - // padding + uint8_t is_huge:1; // `true` if the page is in a huge segment + // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` uint16_t reserved; // number of blocks reserved in memory @@ -348,11 +336,12 @@ typedef struct mi_page_s { _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads _Atomic(uintptr_t) xheap; - struct mi_page_s* next; // next page owned by this thread with the same `block_size` - struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` + struct mi_page_s* next; // next page owned by the heap with the same `block_size` + struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - // 64-bit 11 words, 32-bit 13 words, (+2 for secure) + #if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit void* padding[1]; + #endif } mi_page_t; @@ -365,44 +354,10 @@ typedef enum mi_page_kind_e { MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment - MI_PAGE_HUGE // a huge page is a single page in a segment of variable size - // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. + MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned) + // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an alignment `> MI_BLOCK_ALIGNMENT_MAX`. 
} mi_page_kind_t; -typedef enum mi_segment_kind_e { - MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside. - MI_SEGMENT_HUGE, // segment with just one huge page inside. -} mi_segment_kind_t; - -// ------------------------------------------------------ -// A segment holds a commit mask where a bit is set if -// the corresponding MI_COMMIT_SIZE area is committed. -// The MI_COMMIT_SIZE must be a multiple of the slice -// size. If it is equal we have the most fine grained -// decommit (but setting it higher can be more efficient). -// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will -// be committed in one go which can be set higher than -// MI_COMMIT_SIZE for efficiency (while the decommit mask -// is still tracked in fine-grained MI_COMMIT_SIZE chunks) -// ------------------------------------------------------ - -#define MI_MINIMAL_COMMIT_SIZE (1*MI_SEGMENT_SLICE_SIZE) -#define MI_COMMIT_SIZE (MI_SEGMENT_SLICE_SIZE) // 64KiB -#define MI_COMMIT_MASK_BITS (MI_SEGMENT_SIZE / MI_COMMIT_SIZE) -#define MI_COMMIT_MASK_FIELD_BITS MI_SIZE_BITS -#define MI_COMMIT_MASK_FIELD_COUNT (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS) - -#if (MI_COMMIT_MASK_BITS != (MI_COMMIT_MASK_FIELD_COUNT * MI_COMMIT_MASK_FIELD_BITS)) -#error "the segment size must be exactly divisible by the (commit size * size_t bits)" -#endif - -typedef struct mi_commit_mask_s { - size_t mask[MI_COMMIT_MASK_FIELD_COUNT]; -} mi_commit_mask_t; - -typedef mi_page_t mi_slice_t; -typedef int64_t mi_msecs_t; - // --------------------------------------------------------------- // a memory id tracks the provenance of arena/OS allocated memory @@ -446,57 +401,43 @@ typedef struct mi_memid_s { } mi_memid_t; -// ----------------------------------------------------------------------------------------- -// Segments are large allocated memory blocks (32mb on 64 bit) from arenas or the OS. -// -// Inside segments we allocated fixed size mimalloc pages (`mi_page_t`) that contain blocks. -// The start of a segment is this structure with a fixed number of slice entries (`slices`) -// usually followed by a guard OS page and the actual allocation area with pages. -// While a page is not allocated, we view it's data as a `mi_slice_t` (instead of a `mi_page_t`). -// Of any free area, the first slice has the info and `slice_offset == 0`; for any subsequent -// slices part of the area, the `slice_offset` is the byte offset back to the first slice -// (so we can quickly find the page info on a free, `internal.h:_mi_segment_page_of`). -// For slices, the `block_size` field is repurposed to signify if a slice is used (`1`) or not (`0`). -// Small and medium pages use a fixed amount of slices to reduce slice fragmentation, while -// large and huge pages span a variable amount of slices. - +// --------------------------------------------------------------- +// Segments contain mimalloc pages +// --------------------------------------------------------------- typedef struct mi_subproc_s mi_subproc_t; +// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS. +// Inside segments we allocated fixed size _pages_ that contain blocks. 
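In this layout a pointer reaches its metadata with pure address arithmetic: align down to the segment (segments are allocated at `MI_SEGMENT_SIZE` alignment) and shift the in-segment offset by the segment's `page_shift` to index `pages[]`, as `_mi_segment_page_idx_of` above does. A worked sketch with the 64-bit constants from this diff (22-bit segment shift = 4MiB, 16-bit small-page shift = 64KiB; the segment base address is made up):

```c
#include <stdint.h>
#include <stddef.h>
#include <assert.h>

#define SEGMENT_SHIFT     22u   // 4MiB segments (MI_SEGMENT_SHIFT on 64-bit)
#define SEGMENT_MASK      ((uintptr_t)((1u << SEGMENT_SHIFT) - 1))
#define SMALL_PAGE_SHIFT  16u   // 64KiB small pages (MI_SMALL_PAGE_SHIFT on 64-bit)

// align the pointer down to its segment start (as in _mi_ptr_segment)
static uintptr_t ptr_segment(uintptr_t p) { return p & ~SEGMENT_MASK; }

// index into segment->pages[], as in _mi_segment_page_idx_of above
static size_t ptr_page_idx(uintptr_t p, unsigned page_shift) {
  return (size_t)((p - ptr_segment(p)) >> page_shift);
}

int main(void) {
  uintptr_t segment = 0x7f0000400000;     // hypothetical 4MiB-aligned segment base
  uintptr_t p = segment + 3*65536 + 40;   // a block inside the 4th small page
  assert(ptr_segment(p) == segment);
  assert(ptr_page_idx(p, SMALL_PAGE_SHIFT) == 3);
  return 0;
}
```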
typedef struct mi_segment_s { // constant fields - mi_memid_t memid; // memory id for arena/OS allocation - bool allow_decommit; // can we decommmit the memory - bool allow_purge; // can we purge the memory (reset or decommit) - size_t segment_size; - mi_subproc_t* subproc; // segment belongs to sub process + mi_memid_t memid; // memory id to track provenance + bool allow_decommit; + bool allow_purge; + size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` + mi_subproc_t* subproc; // segment belongs to sub process // segment fields - mi_msecs_t purge_expire; // purge slices in the `purge_mask` after this time - mi_commit_mask_t purge_mask; // slices that can be purged - mi_commit_mask_t commit_mask; // slices that are currently committed + struct mi_segment_s* next; // must be the first (non-constant) segment field -- see `segment.c:segment_init` + struct mi_segment_s* prev; + bool was_reclaimed; // true if it was reclaimed (used to limit reclaim-on-free reclamation) + bool dont_free; // can be temporarily true to ensure the segment is not freed - // from here is zero initialized - struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`) - bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) - bool dont_free; // can be temporarily true to ensure the segment is not freed + size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) + size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it is too long) - size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) - size_t abandoned_visits; // count how often this segment is visited during abondoned reclamation (to force reclaim if it takes too long) - size_t used; // count of pages in use - uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie` + size_t used; // count of pages in use (`used <= capacity`) + size_t capacity; // count of available pages (`#free + used`) + size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages. + uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled struct mi_segment_s* abandoned_os_prev; - size_t segment_slices; // for huge segments this may be different from `MI_SLICES_PER_SEGMENT` - size_t segment_info_slices; // initial count of slices that we are using for segment info and possible guard pages. - // layout like this to optimize access in `mi_free` - mi_segment_kind_t kind; - size_t slice_entries; // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT` _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment - - mi_slice_t slices[MI_SLICES_PER_SEGMENT+1]; // one extra final entry for huge blocks with large alignment + size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). 
+ mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge
+ mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
 } mi_segment_t;
 
@@ -571,6 +512,7 @@ struct mi_heap_s {
   size_t guarded_size_min; // minimal size for guarded objects
   size_t guarded_size_max; // maximal size for guarded objects
   size_t guarded_sample_rate; // sample rate (set to 0 to disable guarded pages)
+  size_t guarded_sample_seed; // starting sample count
   size_t guarded_sample_count; // current sample count (counting down to 0)
 #endif
   mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
@@ -599,19 +541,20 @@ struct mi_subproc_s {
 
 // ------------------------------------------------------
 // Thread Local data
 // ------------------------------------------------------
 
-// A "span" is an available range of slices. The span queues keep
-// track of slice spans of at most the given `slice_count` (but more than the previous size class).
-typedef struct mi_span_queue_s {
-  mi_slice_t* first;
-  mi_slice_t* last;
-  size_t slice_count;
-} mi_span_queue_t;
+// Milliseconds, stored as `int64_t` to avoid overflow
+typedef int64_t mi_msecs_t;
 
-#define MI_SEGMENT_BIN_MAX (35) // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT)
+// Queue of segments
+typedef struct mi_segment_queue_s {
+  mi_segment_t* first;
+  mi_segment_t* last;
+} mi_segment_queue_t;
 
 // Segments thread local data
 typedef struct mi_segments_tld_s {
-  mi_span_queue_t spans[MI_SEGMENT_BIN_MAX+1]; // free slice spans inside segments
+  mi_segment_queue_t small_free; // queue of segments with free small pages
+  mi_segment_queue_t medium_free; // queue of segments with free medium pages
+  mi_page_queue_t pages_purge; // queue of freed pages that are delay-purged
   size_t count; // current number of segments
   size_t peak_count; // peak number of segments
   size_t current_size; // current size of all segments
@@ -632,6 +575,7 @@ struct mi_tld_s {
 };
 
+
 // ------------------------------------------------------
 // Debug
 // ------------------------------------------------------
@@ -646,6 +590,26 @@ struct mi_tld_s {
 #define MI_DEBUG_PADDING (0xDE)
 #endif
 
+#if (MI_DEBUG)
+// use our own assertion to print without memory allocation
+void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
+#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
+#else
+#define mi_assert(x)
+#endif
+
+#if (MI_DEBUG>1)
+#define mi_assert_internal mi_assert
+#else
+#define mi_assert_internal(x)
+#endif
+
+#if (MI_DEBUG>2)
+#define mi_assert_expensive mi_assert
+#else
+#define mi_assert_expensive(x)
+#endif
+
 
 // ------------------------------------------------------
 // Statistics
@@ -661,25 +625,22 @@ struct mi_tld_s {
 // add to stat keeping track of the peak
 void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
 void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
-void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount);
 // counters can just be increased
 void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
 
 #if (MI_STAT)
 #define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount)
 #define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount)
-#define mi_stat_adjust_decrease(stat,amount) _mi_stat_adjust_decrease( &(stat), amount)
 #define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount)
 #else
 #define mi_stat_increase(stat,amount) ((void)0)
 #define mi_stat_decrease(stat,amount) ((void)0)
-#define mi_stat_adjust_decrease(stat,amount) ((void)0)
 #define mi_stat_counter_increase(stat,amount) ((void)0)
 #endif
 
 #define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
 #define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount)
 #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount)
-#define mi_heap_stat_adjust_decrease(heap,stat,amount) mi_stat_adjust_decrease( (heap)->tld->stats.stat, amount)
+
 #endif
diff --git a/readme.md b/readme.md
index ddf358b2..81f2057e 100644
--- a/readme.md
+++ b/readme.md
@@ -12,9 +12,9 @@ is a general purpose allocator with excellent [performance](#performance) charac
 Initially developed by Daan Leijen for the runtime systems of the
 [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
 
-Latest release : `v3.1.5` (beta) (2025-06-13).
-Latest v2 release: `v2.2.4` (2025-06-09).
-Latest v1 release: `v1.9.4` (2024-06-09).
+Latest release : `v3.0.2` (beta) (2025-03-06).
+Latest v2 release: `v2.2.2` (2025-03-06).
+Latest v1 release: `v1.9.2` (2024-03-06).
 
 mimalloc is a drop-in replacement for `malloc` and can be used in other programs
 without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
@@ -72,25 +72,18 @@ Enjoy!
 
 ### Branches
 
-* `main`: latest stable release (still based on `dev2`).
+* `master`: latest stable release (still based on `dev2`).
 * `dev`: development branch for mimalloc v1. Use this branch for submitting PR's.
 * `dev2`: development branch for mimalloc v2. This branch is downstream of `dev`
   (and is essentially equal to `dev` except for `src/segment.c`). Uses larger sliced segments to manage
   mimalloc pages that can reduce fragmentation.
-* `dev3`: development branch for mimalloc v3 beta. This branch is downstream of `dev`. This version
-  simplifies the lock-free ownership of previous versions, and improves sharing of memory between
-  threads. On certain large workloads this version may use (much) less memory.
+* `dev3`: development branch for mimalloc v3-beta. This branch is downstream of `dev`. This version
+  simplifies the lock-free ownership of previous versions and no longer uses thread-local segments.
+  This improves sharing of memory between threads, and on certain large workloads may use less memory
+  with less fragmentation.
 
 ### Releases
 
-* 2025-06-13, `v3.1.5`: Bug fix release where memory was not always correctly committed (issue #1098).
-* 2025-06-09, `v1.9.4`, `v2.2.4`, `v3.1.4` (beta) : Some important bug fixes, including a case where OS memory
-  was not always fully released. Improved v3 performance, build on XBox, fix build on Android, support interpose
-  for older macOS versions, use MADV_FREE_REUSABLE on macOS, always check commit success, better support for Windows
-  fixed TLS offset, etc.
-* 2025-03-28, `v1.9.3`, `v2.2.3`, `v3.0.3` (beta) : Various small bug and build fixes, including:
-  fix arm32 pre v7 builds, fix mingw build, get runtime statistics, improve statistic commit counts,
-  fix execution on non BMI1 x64 systems.
 * 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes.
   Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`.
   Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile.
@@ -104,13 +97,53 @@ Enjoy!
   add 0-byte to canary; upstream CPython fixes; reduce .bss size; allow fixed TLS slot on Windows for improved performance.
 * 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches
   from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches.
+* 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation.
+* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds.
+  Freeing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size
+  directly available (and new `block_size_shift` to improve aligned block freeing).
+  New approach to collection of abandoned segments: when
+  a thread terminates, the segments it owns are abandoned (containing still live objects) and these can be
+  reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arenas,
+  which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in
+  an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim`
+  gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%).
+
+* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on FreeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity
+  by removing regions and segment caches and only use arenas with improved memory purging -- this may improve memory
+  usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking.
+
+* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms.
+
+* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support for dynamic overriding on Windows 11. Improved tracing precision
+  with [asan](#asan) and [Valgrind](#valgrind), and added Windows event tracing [ETW](#ETW) (contributed by Xinglong He). Created an OS
+  abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes.
+
+* 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support.
+  Support arbitrary large alignments (in particular for `std::pmr` pools).
+  Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev).
+  Heap walks now visit all objects (including huge objects). Support Windows nano server containers (by Johannes Schindelin, @dscho).
+  Various small bug fixes.
+
+* 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow
+  detection. Initial
+  support for attaching heaps to a specific memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`.
+
+* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation
+  even when compiling with older SDKs, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix
+  warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object
+  allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes.
+
+* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on
+  Windows 11, fix compilation with musl, potentially reduced
+  committed memory, add `bin/minject` for Windows,
+  improved wasm support, faster aligned allocation,
+  various small fixes.
 
 * [Older release notes](#older-release-notes)
 
 Special thanks to:
-* Sergiy Kuryata for his contributions on reducing memory commit -- especially on Windows with the Windows thread pool (now implemented in v3).
-* [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his _many_ contributions, and making
+* [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his many contributions, and making
   mimalloc work better on many less common operating systems, like Haiku, Dragonfly, etc.
 * Mary Feofanova (@mary3000), Evgeniy Moiseenko, and Manuel Pöter (@mpoeter) for making mimalloc TSAN checkable, and finding
   memory model bugs using the [genMC] model checker.
@@ -141,7 +174,7 @@ mimalloc is used in various large scale low-latency services and programs, for e
 
 Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build.
 The `mimalloc-lib` project builds a static library (in `out/msvc-x64`), while the
-`mimalloc-override-dll` project builds a DLL for overriding malloc
+`mimalloc-override-dll` project builds a DLL for overriding malloc
We no longer use a list of abandoned segments but this is now done using bitmaps in arena's - which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in - an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim` - gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%). - -* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity - by removing regions and segment-cache's and only use arenas with improved memory purging -- this may improve memory - usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking. - -* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms. - -* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support dynamic overriding on Windows 11. Improved tracing precision - with [asan](#asan) and [Valgrind](#valgrind), and added Windows event tracing [ETW](#ETW) (contributed by Xinglong He). Created an OS - abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes. - -* 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support. - Support arbitrary large alignments (in particular for `std::pmr` pools). - Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev). - Heap walks now visit all object (including huge objects). Support Windows nano server containers (by Johannes Schindelin,@dscho). - Various small bug fixes. - -* 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow - detection. Initial - support for attaching heaps to a specific memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`, . - -* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation - even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix - warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object - allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes. - -* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on - Windows 11, fix compilation with musl, potentially reduced - committed memory, add `bin/minject` for Windows, - improved wasm support, faster aligned allocation, - various small fixes. - * 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including M1), improved performance for v2 for large objects, Python integration improvements, more standard installation directories, various small fixes. 
diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 3d3202eb..d0e691b3 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -115,7 +115,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t // now zero the block if needed if (alignment > MI_BLOCK_ALIGNMENT_MAX) { - // for the tracker, on huge aligned allocations only the memory from the start of the large block is defined + // for the tracker, on huge aligned allocations only from the start of the large block is defined mi_track_mem_undefined(aligned_p, size); if (zero) { _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); @@ -191,6 +191,9 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; if mi_likely(is_aligned) { + #if MI_STAT>1 + mi_heap_stat_increase(heap, malloc_requested, size); + #endif void* p = (zero ? _mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen mi_assert_internal(p != NULL); mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); @@ -217,11 +220,6 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, return mi_heap_malloc_aligned_at(heap, size, alignment, 0); } -// ensure a definition is emitted -#if defined(__cplusplus) -void* _mi_extern_heap_malloc_aligned = (void*)&mi_heap_malloc_aligned; -#endif - // ------------------------------------------------------ // Aligned Allocation // ------------------------------------------------------ diff --git a/src/alloc-override.c b/src/alloc-override.c index 52ab69c5..b5109ded 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -71,20 +71,24 @@ typedef void* mi_nothrow_t; #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } #define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) - #define MI_INTERPOSE_DECLS(name) __attribute__((used)) static struct mi_interpose_s name[] __attribute__((section("__DATA, __interpose"))) - - MI_INTERPOSE_DECLS(_mi_interposes) = + __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) = { MI_INTERPOSE_MI(malloc), MI_INTERPOSE_MI(calloc), MI_INTERPOSE_MI(realloc), MI_INTERPOSE_MI(strdup), + #if defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7 + MI_INTERPOSE_MI(strndup), + #endif MI_INTERPOSE_MI(realpath), MI_INTERPOSE_MI(posix_memalign), MI_INTERPOSE_MI(reallocf), MI_INTERPOSE_MI(valloc), MI_INTERPOSE_FUN(malloc_size,mi_malloc_size_checked), MI_INTERPOSE_MI(malloc_good_size), + #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 + MI_INTERPOSE_MI(aligned_alloc), + #endif #ifdef MI_OSX_ZONE // we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely MI_INTERPOSE_MI(free), @@ -95,12 +99,6 @@ typedef void* mi_nothrow_t; MI_INTERPOSE_FUN(vfree,mi_cfree), #endif }; - MI_INTERPOSE_DECLS(_mi_interposes_10_7) __OSX_AVAILABLE(10.7) = { - MI_INTERPOSE_MI(strndup), - }; - MI_INTERPOSE_DECLS(_mi_interposes_10_15) __OSX_AVAILABLE(10.15) = { - MI_INTERPOSE_MI(aligned_alloc), - }; #ifdef __cplusplus extern "C" { diff --git a/src/alloc.c b/src/alloc.c index 0fed5e75..15867315 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -30,7 +30,6 @@ terms of the MIT license. 
A copy of the license can be found in the file // Note: in release mode the (inlined) routine is about 7 instructions with a single test. extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { - mi_assert_internal(size >= MI_PADDING_SIZE); mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); // check the free list @@ -83,13 +82,12 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ #if (MI_STAT>0) const size_t bsize = mi_page_usable_block_size(page); - if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) { + if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { mi_heap_stat_increase(heap, malloc_normal, bsize); mi_heap_stat_counter_increase(heap, malloc_normal_count, 1); #if (MI_STAT>1) const size_t bin = _mi_bin(bsize); mi_heap_stat_increase(heap, malloc_bins[bin], 1); - mi_heap_stat_increase(heap, malloc_requested, size - MI_PADDING_SIZE); #endif } #endif @@ -148,6 +146,12 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); mi_track_malloc(p,size,zero); + #if MI_STAT>1 + if (p != NULL) { + if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } + mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p)); + } + #endif #if MI_DEBUG>3 if (p != NULL && zero) { mi_assert_expensive(mi_mem_is_zero(p, size)); @@ -184,6 +188,12 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic mi_track_malloc(p,size,zero); + #if MI_STAT>1 + if (p != NULL) { + if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } + mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p)); + } + #endif #if MI_DEBUG>3 if (p != NULL && zero) { mi_assert_expensive(mi_mem_is_zero(p, size)); @@ -630,7 +640,7 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { // give up to place it right in front of the guard page if the offset is too large for unalignment offset = MI_BLOCK_ALIGNMENT_MAX; } - void* p = (uint8_t*)block + offset; + void* p = (uint8_t*)block + offset; mi_track_align(block, p, offset, obj_size); mi_track_mem_defined(block, sizeof(mi_block_t)); return p; @@ -652,12 +662,11 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo void* const p = mi_block_ptr_set_guarded(block, obj_size); // stats - mi_track_malloc(p, size, zero); + mi_track_malloc(p, size, zero); if (p != NULL) { if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } #if MI_STAT>1 - mi_heap_stat_adjust_decrease(heap, malloc_requested, req_size); - mi_heap_stat_increase(heap, malloc_requested, size); + mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p)); #endif _mi_stat_counter_increase(&heap->tld->stats.malloc_guarded_count, 1); } @@ -685,7 +694,7 @@ void* _mi_externs[] = { (void*)&mi_zalloc_small, (void*)&mi_heap_malloc, (void*)&mi_heap_zalloc, - (void*)&mi_heap_malloc_small, + (void*)&mi_heap_malloc_small // (void*)&mi_heap_alloc_new, // (void*)&mi_heap_alloc_new_n }; diff --git a/src/arena.c b/src/arena.c index e97ca885..9d40a271 100644 --- a/src/arena.c +++ b/src/arena.c @@ -44,7 +44,7 @@ typedef struct mi_arena_s { mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited 
_Atomic(size_t) search_idx; // optimization to start the search for free blocks _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be purged from `blocks_purge`. - + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) @@ -99,10 +99,6 @@ bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_i } } -bool _mi_arena_memid_is_os_allocated(mi_memid_t memid) { - return (memid.memkind == MI_MEM_OS); -} - size_t mi_arena_get_count(void) { return mi_atomic_load_relaxed(&mi_arena_count); } @@ -192,9 +188,14 @@ void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { if (p != NULL) return p; // or fall back to the OS - p = _mi_os_zalloc(size, memid); + p = _mi_os_alloc(size, memid); if (p == NULL) return NULL; + // zero the OS memory if needed + if (!memid->initially_zero) { + _mi_memzero_aligned(p, size); + memid->initially_zero = true; + } return p; } @@ -254,7 +255,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar // set the dirty bits (todo: no need for an atomic op here?) if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { - memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL, NULL); + memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); } // set commit state @@ -265,36 +266,21 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar else if (commit) { // commit requested, but the range may not be committed as a whole: ensure it is committed now memid->initially_committed = true; - const size_t commit_size = mi_arena_block_size(needed_bcount); bool any_uncommitted; - size_t already_committed = 0; - _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted, &already_committed); + _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); if (any_uncommitted) { - mi_assert_internal(already_committed < needed_bcount); - const size_t stat_commit_size = commit_size - mi_arena_block_size(already_committed); bool commit_zero = false; - if (!_mi_os_commit_ex(p, commit_size, &commit_zero, stat_commit_size)) { + if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero)) { memid->initially_committed = false; } else { if (commit_zero) { memid->initially_zero = true; } } } - else { - // all are already committed: signal that we are reusing memory in case it was purged before - _mi_os_reuse( p, commit_size ); - } } else { // no need to commit, but check if already fully committed - size_t already_committed = 0; - memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &already_committed); - if (!memid->initially_committed && already_committed > 0) { - // partially committed: as it will be committed at some time, adjust the stats and pretend the range is fully uncommitted. 
- mi_assert_internal(already_committed < needed_bcount); - _mi_stat_decrease(&_mi_stats_main.committed, mi_arena_block_size(already_committed)); - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); - } + memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); } return p; @@ -368,7 +354,7 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t *arena_id) { if (_mi_preloading()) return false; // use OS only while pre loading - + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); if (arena_count > (MI_MAX_ARENAS - 4)) return false; @@ -410,7 +396,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) if (!mi_option_is_enabled(mi_option_disallow_arena_alloc)) { // is arena allocation allowed? - if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) + if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid); if (p != NULL) return p; @@ -478,19 +464,17 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks) const size_t size = mi_arena_block_size(blocks); void* const p = mi_arena_block_start(arena, bitmap_idx); bool needs_recommit; - size_t already_committed = 0; - if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx, &already_committed)) { + if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { // all blocks are committed, we can purge freely - mi_assert_internal(already_committed == blocks); needs_recommit = _mi_os_purge(p, size); } else { // some blocks are not committed -- this can happen when a partially committed block is freed // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory). - mi_assert_internal(already_committed < blocks); + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // and also undo the decommit stats (as it was already adjusted) mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, mi_arena_block_size(already_committed)); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, 0); } // clear the purged blocks @@ -524,7 +508,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t else { // already an expiration was set } - _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL, NULL); + _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); } } @@ -559,7 +543,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) { // check pre-conditions if (arena->memid.is_pinned) return false; - + // expired yet? 
mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
   if (!force && (expire == 0 || expire > now)) return false;
@@ -614,7 +598,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force)
   return any_purged;
 }
 
-static void mi_arenas_try_purge( bool force, bool visit_all )
+static void mi_arenas_try_purge( bool force, bool visit_all )
 {
@@ -631,7 +615,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all )
   mi_atomic_guard(&purge_guard)
   {
     // increase global expire: at most one purge per delay cycle
-    mi_atomic_storei64_release(&mi_arenas_purge_expire, now + mi_arena_purge_delay());
+    mi_atomic_storei64_release(&mi_arenas_purge_expire, now + mi_arena_purge_delay());
     size_t max_purge_count = (visit_all ? max_arena : 2);
     bool all_visited = true;
     for (size_t i = 0; i < max_arena; i++) {
@@ -664,16 +648,15 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
   if (p==NULL) return;
   if (size==0) return;
   const bool all_committed = (committed_size == size);
-  const size_t decommitted_size = (committed_size <= size ? size - committed_size : 0);
 
   // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.)
   mi_track_mem_undefined(p,size);
 
   if (mi_memkind_is_os(memid.memkind)) {
     // was a direct OS allocation, pass through
-    if (!all_committed && decommitted_size > 0) {
-      // if partially committed, adjust the committed stats (as `_mi_os_free` will decrease commit by the full size)
-      _mi_stat_increase(&_mi_stats_main.committed, decommitted_size);
+    if (!all_committed && committed_size > 0) {
+      // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size)
+      _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
     }
     _mi_os_free(p, size, memid);
   }
@@ -707,14 +690,14 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
     mi_assert_internal(arena->blocks_purge != NULL);
 
     if (!all_committed) {
-      // mark the entire range as no longer committed (so we will recommit the full range when re-using)
+      // mark the entire range as no longer committed (so we recommit the full range when re-using)
       _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
       mi_track_mem_noaccess(p,size);
-      //if (committed_size > 0) {
+      if (committed_size > 0) {
       // if partially committed, adjust the committed stats (is it will be recommitted when re-using)
-      // in the delayed purge, we do no longer decrease the commit if the range is not marked entirely as committed.
+      // in the delayed purge, we must not count a decommit if the range is not marked as committed.
       _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
-      //}
+      }
      // note: if not all committed, it may be that the purge will reset/decommit the entire range
      // that contains already decommitted parts. Since purge consistently uses reset or decommit that
      // works (as we should never reset decommitted parts).
@@ -950,7 +933,7 @@ void mi_debug_show_arenas(void) mi_attr_noexcept {
   for (size_t i = 0; i < max_arenas; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
     if (arena == NULL) break;
-    _mi_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, (size_t)(MI_ARENA_BLOCK_SIZE / MI_MiB), arena->field_count, (arena->memid.is_pinned ? 
", pinned" : "")); + _mi_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); } @@ -1010,17 +993,17 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t if (pages == 0) return 0; // pages per numa node - int numa_count = (numa_nodes > 0 && numa_nodes <= INT_MAX ? (int)numa_nodes : _mi_os_numa_node_count()); - if (numa_count == 0) numa_count = 1; + size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) numa_count = 1; const size_t pages_per = pages / numa_count; const size_t pages_mod = pages % numa_count; const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); // reserve evenly among numa nodes - for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { size_t node_pages = pages_per; // can be 0 - if ((size_t)numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per); + if (numa_node < pages_mod) node_pages++; + int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); if (err) return err; if (pages < node_pages) { pages = 0; diff --git a/src/bitmap.c b/src/bitmap.c index 32d1e954..9ef784d6 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -34,17 +34,17 @@ static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) { } + /* ----------------------------------------------------------- Claim a bit sequence atomically ----------------------------------------------------------- */ // Try to atomically claim a sequence of `count` bits in a single // field at `idx` in `bitmap`. Returns `true` on success. -inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) +bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { mi_assert_internal(bitmap_idx != NULL); mi_assert_internal(count <= MI_BITMAP_FIELD_BITS); - mi_assert_internal(count > 0); mi_bitmap_field_t* field = &bitmap[idx]; size_t map = mi_atomic_load_relaxed(field); if (map==MI_BITMAP_FIELD_FULL) return false; // short cut @@ -94,9 +94,9 @@ inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, cons return false; } -// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success. + // Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. +// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. 
bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { size_t idx = start_field_idx; for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { @@ -108,24 +108,6 @@ bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fiel return false; } -// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fullfilled -bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, - const size_t start_field_idx, const size_t count, - mi_bitmap_pred_fun_t pred_fun, void* pred_arg, - mi_bitmap_index_t* bitmap_idx) { - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) idx = 0; // wrap - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - if (pred_fun == NULL || pred_fun(*bitmap_idx, pred_arg)) { - return true; - } - // predicate returned false, unclaim and look further - _mi_bitmap_unclaim(bitmap, bitmap_fields, count, *bitmap_idx); - } - } - return false; -} // Set `count` bits at `bitmap_idx` to 0 atomically // Returns `true` if all `count` bits were 1 previously. @@ -246,7 +228,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit // intermediate fields while (++field < final_field) { - newmap = MI_BITMAP_FIELD_FULL; + newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); map = 0; if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } } @@ -268,7 +250,7 @@ rollback: // (we just failed to claim `field` so decrement first) while (--field > initial_field) { newmap = 0; - map = MI_BITMAP_FIELD_FULL; + map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); mi_assert_internal(mi_atomic_load_relaxed(field) == map); mi_atomic_store_release(field, newmap); } @@ -369,7 +351,7 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // Set `count` bits at `bitmap_idx` to 1 atomically // Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. 
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero, size_t* already_set) { +bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) { size_t idx = mi_bitmap_index_field(bitmap_idx); size_t pre_mask; size_t mid_mask; @@ -377,31 +359,28 @@ bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t co size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); bool all_zero = true; bool any_zero = false; - size_t one_count = 0; _Atomic(size_t)*field = &bitmap[idx]; size_t prev = mi_atomic_or_acq_rel(field++, pre_mask); - if ((prev & pre_mask) != 0) { all_zero = false; one_count += mi_popcount(prev & pre_mask); } + if ((prev & pre_mask) != 0) all_zero = false; if ((prev & pre_mask) != pre_mask) any_zero = true; while (mid_count-- > 0) { prev = mi_atomic_or_acq_rel(field++, mid_mask); - if ((prev & mid_mask) != 0) { all_zero = false; one_count += mi_popcount(prev & mid_mask); } + if ((prev & mid_mask) != 0) all_zero = false; if ((prev & mid_mask) != mid_mask) any_zero = true; } if (post_mask!=0) { prev = mi_atomic_or_acq_rel(field, post_mask); - if ((prev & post_mask) != 0) { all_zero = false; one_count += mi_popcount(prev & post_mask); } + if ((prev & post_mask) != 0) all_zero = false; if ((prev & post_mask) != post_mask) any_zero = true; } if (pany_zero != NULL) { *pany_zero = any_zero; } - if (already_set != NULL) { *already_set = one_count; }; - mi_assert_internal(all_zero ? one_count == 0 : one_count <= count); return all_zero; } // Returns `true` if all `count` bits were 1. // `any_ones` is `true` if there was at least one bit set to one. -static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones, size_t* already_set) { +static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { size_t idx = mi_bitmap_index_field(bitmap_idx); size_t pre_mask; size_t mid_mask; @@ -409,33 +388,30 @@ static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_field size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); bool all_ones = true; bool any_ones = false; - size_t one_count = 0; mi_bitmap_field_t* field = &bitmap[idx]; size_t prev = mi_atomic_load_relaxed(field++); if ((prev & pre_mask) != pre_mask) all_ones = false; - if ((prev & pre_mask) != 0) { any_ones = true; one_count += mi_popcount(prev & pre_mask); } + if ((prev & pre_mask) != 0) any_ones = true; while (mid_count-- > 0) { prev = mi_atomic_load_relaxed(field++); if ((prev & mid_mask) != mid_mask) all_ones = false; - if ((prev & mid_mask) != 0) { any_ones = true; one_count += mi_popcount(prev & mid_mask); } + if ((prev & mid_mask) != 0) any_ones = true; } if (post_mask!=0) { prev = mi_atomic_load_relaxed(field); if ((prev & post_mask) != post_mask) all_ones = false; - if ((prev & post_mask) != 0) { any_ones = true; one_count += mi_popcount(prev & post_mask); } + if ((prev & post_mask) != 0) any_ones = true; } if (pany_ones != NULL) { *pany_ones = any_ones; } - if (already_set != NULL) { *already_set = one_count; } - mi_assert_internal(all_ones ? 
one_count == count : one_count < count); return all_ones; } -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, size_t* already_set) { - return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL, already_set); +bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL); } bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { bool any_ones; - mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones, NULL); + mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); return any_ones; } diff --git a/src/bitmap.h b/src/bitmap.h index 0f4744f4..d60668cb 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -44,11 +44,6 @@ static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx return mi_bitmap_index_create_ex(idx,bitidx); } -// Create a bit index. -static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) { - return mi_bitmap_index_create(full_bitidx / MI_BITMAP_FIELD_BITS, full_bitidx % MI_BITMAP_FIELD_BITS); -} - // Get the field index from a bit index. static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { return (bitmap_idx / MI_BITMAP_FIELD_BITS); @@ -76,10 +71,6 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ // For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); -// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fullfilled -typedef bool (mi_cdecl *mi_bitmap_pred_fun_t)(mi_bitmap_index_t bitmap_idx, void* pred_arg); -bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_pred_fun_t pred_fun, void* pred_arg, mi_bitmap_index_t* bitmap_idx); - // Set `count` bits at `bitmap_idx` to 0 atomically // Returns `true` if all `count` bits were 1 previously. bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); @@ -111,9 +102,9 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // Set `count` bits at `bitmap_idx` to 1 atomically // Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. 
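The `_across` variants above handle ranges that straddle `size_t` fields by splitting them into a leading partial mask, zero or more full middle fields, and a trailing partial mask (the `pre_mask`/`mid_mask`/`post_mask` computed by `mi_bitmap_mask_across`). A worked example, assuming 64-bit fields:

```c
// Illustrative only: a claim of count = 70 bits starting at absolute bit 60
// (i.e. field 0, bit 60) splits into three masks:
//
//   field 0: pre_mask  = 0xF000000000000000  -> bits 60..63  ( 4 bits)
//   field 1: mid_mask  = 0xFFFFFFFFFFFFFFFF  -> bits  0..63  (64 bits)
//   field 2: post_mask = 0x0000000000000003  -> bits  0..1   ( 2 bits)
//
// 4 + 64 + 2 == 70 == count; each field is then updated with one atomic OR.
```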
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero, size_t* already_set); +bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero); -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, size_t* already_set); +bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); #endif diff --git a/src/free.c b/src/free.c index 5e5ae443..a1732e8c 100644 --- a/src/free.c +++ b/src/free.c @@ -35,9 +35,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool mi_check_padding(page, block); if (track_stats) { mi_stat_free(page, block); } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN && !MI_GUARDED - if (!mi_page_is_huge(page)) { // huge page content may be already decommitted - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - } + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block)); } // faster then mi_usable_size as we already know the page and that p is unaligned @@ -123,16 +121,10 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms #if (MI_DEBUG>0) if mi_unlikely(!mi_is_in_heap_region(p)) { - #if (MI_INTPTR_SIZE == 8 && defined(__linux__)) - if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640) - #else - { - #endif - _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" - "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); - if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { - _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } + _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" + "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); + if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { + _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); } } #endif @@ -280,7 +272,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection _mi_padding_shrink(page, block, sizeof(mi_block_t)); - if (segment->kind == MI_SEGMENT_HUGE) { + if (segment->page_kind == MI_PAGE_HUGE) { #if MI_HUGE_PAGE_ABANDON // huge page segments are always abandoned and can be freed immediately _mi_segment_huge_page_free(segment, page, block); @@ -348,10 +340,7 @@ mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { void mi_free_size(void* p, size_t size) mi_attr_noexcept { MI_UNUSED_RELEASE(size); - #if MI_DEBUG - const size_t available = _mi_usable_size(p,"mi_free_size"); - mi_assert(p == NULL || size <= available || available == 0 /* invalid pointer */ ); - #endif + mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); mi_free(p); } @@ -525,24 +514,24 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { // only maintain stats for smaller objects if requested #if (MI_STAT>0) static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +#if 
(MI_STAT < 2) MI_UNUSED(block); +#endif mi_heap_t* const heap = mi_heap_get_default(); const size_t bsize = mi_page_usable_block_size(page); - // #if (MI_STAT>1) - // const size_t usize = mi_page_usable_size_of(page, block); - // mi_heap_stat_decrease(heap, malloc_requested, usize); - // #endif - if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) { +#if (MI_STAT>1) + const size_t usize = mi_page_usable_size_of(page, block); + mi_heap_stat_decrease(heap, malloc_requested, usize); +#endif + if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { mi_heap_stat_decrease(heap, malloc_normal, bsize); - #if (MI_STAT > 1) +#if (MI_STAT > 1) mi_heap_stat_decrease(heap, malloc_bins[_mi_bin(bsize)], 1); - #endif +#endif } - //else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - // mi_heap_stat_decrease(heap, malloc_large, bsize); - //} else { - mi_heap_stat_decrease(heap, malloc_huge, bsize); + const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc + mi_heap_stat_decrease(heap, malloc_huge, bpsize); } } #else diff --git a/src/heap.c b/src/heap.c index f96e60d0..7c235a7b 100644 --- a/src/heap.c +++ b/src/heap.c @@ -95,11 +95,6 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL)); mi_collect_t collect = *((mi_collect_t*)arg_collect); _mi_page_free_collect(page, collect >= MI_FORCE); - if (collect == MI_FORCE) { - // note: call before a potential `_mi_page_free` as the segment may be freed if this was the last used page in that segment. - mi_segment_t* segment = _mi_page_segment(page); - _mi_segment_collect(segment, true /* force? */); - } if (mi_page_all_free(page)) { // no more used blocks, free the page. // note: this will free retired pages as well. @@ -132,15 +127,14 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); // note: never reclaim on collect but leave it to threads that need storage to reclaim - const bool force_main = - #ifdef NDEBUG + if ( + #ifdef NDEBUG collect == MI_FORCE - #else + #else collect >= MI_FORCE - #endif - && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim; - - if (force_main) { + #endif + && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim) + { // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. // if all memory is freed by now, all segments should be freed. // note: this only collects in the current subprocess @@ -163,9 +157,8 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); - // collect abandoned segments (in particular, purge expired parts of segments in the abandoned segment list) - // note: forced purge can be quite expensive if many threads are created/destroyed so we do not force on abandonment - _mi_abandoned_collect(heap, collect == MI_FORCE /* force? 
*/, &heap->tld->segments); + // collect segments (purge pages, this can be expensive so don't force on abandonment) + _mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); // if forced, collect thread data cache on program-exit (or shared library unload) if (force && is_main_thread && mi_heap_is_backing(heap)) { @@ -176,7 +169,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) _mi_arenas_collect(collect == MI_FORCE /* force purge? */); // merge statistics - if (collect <= MI_FORCE) { _mi_stats_merge_thread(heap->tld); } + if (collect <= MI_FORCE) { + mi_stats_merge(); + } } void _mi_heap_collect_abandon(mi_heap_t* heap) { @@ -333,26 +328,20 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // stats const size_t bsize = mi_page_block_size(page); - if (bsize > MI_MEDIUM_OBJ_SIZE_MAX) { - //if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - // mi_heap_stat_decrease(heap, malloc_large, bsize); - //} - //else - { - mi_heap_stat_decrease(heap, malloc_huge, bsize); - } + if (bsize > MI_LARGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, malloc_huge, bsize); } - #if (MI_STAT>0) +#if (MI_STAT) _mi_page_free_collect(page, false); // update used count const size_t inuse = page->used; if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { mi_heap_stat_decrease(heap, malloc_normal, bsize * inuse); - #if (MI_STAT>1) +#if (MI_STAT>1) mi_heap_stat_decrease(heap, malloc_bins[_mi_bin(bsize)], inuse); - #endif +#endif } - // mi_heap_stat_decrease(heap, malloc_requested, bsize * inuse); // todo: off for aligned blocks... - #endif + mi_heap_stat_decrease(heap, malloc_requested, bsize * inuse); // todo: off for aligned blocks... +#endif /// pretend it is all free now mi_assert_internal(mi_page_thread_free(page) == NULL); diff --git a/src/init.c b/src/init.c index 3fc8b033..215eed20 100644 --- a/src/init.c +++ b/src/init.c @@ -34,12 +34,13 @@ const mi_page_t _mi_page_empty = { MI_ATOMIC_VAR_INIT(0), // xthread_free MI_ATOMIC_VAR_INIT(0), // xheap NULL, NULL - , { 0 } // padding + #if MI_INTPTR_SIZE==4 + , { NULL } + #endif }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) -#if (MI_SMALL_WSIZE_MAX==128) #if (MI_PADDING>0) && (MI_INTPTR_SIZE >= 8) #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } #elif (MI_PADDING>0) @@ -47,9 +48,7 @@ const mi_page_t _mi_page_empty = { #else #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() } #endif -#else -#error "define right initialization sizes corresponding to MI_SMALL_WSIZE_MAX" -#endif + // Empty page queues for every bin #define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) } @@ -64,8 +63,8 @@ const mi_page_t _mi_page_empty = { QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \ - QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \ - QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 2) /* Full queue */ } + QNULL(MI_LARGE_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \ + QNULL(MI_LARGE_OBJ_WSIZE_MAX + 2) /* Full queue */ } #define MI_STAT_COUNT_NULL() {0,0,0} @@ -87,18 +86,6 @@ const mi_page_t _mi_page_empty = { { MI_INIT74(MI_STAT_COUNT_NULL) }, \ { MI_INIT74(MI_STAT_COUNT_NULL) } - -// Empty slice span queues for every bin -#define 
SQNULL(sz) { NULL, NULL, sz } -#define MI_SEGMENT_SPAN_QUEUES_EMPTY \ - { SQNULL(1), \ - SQNULL( 1), SQNULL( 2), SQNULL( 3), SQNULL( 4), SQNULL( 5), SQNULL( 6), SQNULL( 7), SQNULL( 10), /* 8 */ \ - SQNULL( 12), SQNULL( 14), SQNULL( 16), SQNULL( 20), SQNULL( 24), SQNULL( 28), SQNULL( 32), SQNULL( 40), /* 16 */ \ - SQNULL( 48), SQNULL( 56), SQNULL( 64), SQNULL( 80), SQNULL( 96), SQNULL( 112), SQNULL( 128), SQNULL( 160), /* 24 */ \ - SQNULL( 192), SQNULL( 224), SQNULL( 256), SQNULL( 320), SQNULL( 384), SQNULL( 448), SQNULL( 512), SQNULL( 640), /* 32 */ \ - SQNULL( 768), SQNULL( 896), SQNULL( 1024) /* 35 */ } - - // -------------------------------------------------------- // Statically allocate an empty heap as the initial // thread local value for the default heap, @@ -108,7 +95,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -mi_decl_cache_align const mi_heap_t _mi_heap_empty = { +mi_decl_hidden mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, MI_ATOMIC_VAR_INIT(NULL), 0, // tid @@ -123,23 +110,12 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { false, // can reclaim 0, // tag #if MI_GUARDED - 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) + 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) #endif MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY }; -static mi_decl_cache_align mi_subproc_t mi_subproc_default; - -#define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats))) - -mi_decl_cache_align static const mi_tld_t tld_empty = { - 0, - false, - NULL, NULL, - { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, tld_empty_stats }, // segments - { MI_STAT_VERSION, MI_STATS_NULL } // stats -}; mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); @@ -150,10 +126,15 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; extern mi_decl_hidden mi_heap_t _mi_heap_main; +static mi_decl_cache_align mi_subproc_t mi_subproc_default; + static mi_decl_cache_align mi_tld_t tld_main = { 0, false, - &_mi_heap_main, & _mi_heap_main, - { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, &tld_main.stats }, // segments + &_mi_heap_main, &_mi_heap_main, + { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, + 0, 0, 0, 0, 0, &mi_subproc_default, + &tld_main.stats + }, // segments { MI_STAT_VERSION, MI_STATS_NULL } // stats }; @@ -172,7 +153,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { false, // can reclaim 0, // tag #if MI_GUARDED - 0, 0, 0, 0, + 0, 0, 0, 0, 0, #endif MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY @@ -184,14 +165,15 @@ mi_stats_t _mi_stats_main = { MI_STAT_VERSION, MI_STATS_NULL }; #if MI_GUARDED mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { - heap->guarded_sample_rate = sample_rate; - heap->guarded_sample_count = sample_rate; // count down samples - if (heap->guarded_sample_rate > 1) { - if (seed == 0) { - seed = _mi_heap_random_next(heap); - } - heap->guarded_sample_count = (seed % heap->guarded_sample_rate) + 1; // start at random count between 1 and `sample_rate` + heap->guarded_sample_seed = seed; + if (heap->guarded_sample_seed == 0) { + heap->guarded_sample_seed = _mi_heap_random_next(heap); } + heap->guarded_sample_rate = sample_rate; + if (heap->guarded_sample_rate >= 1) { + heap->guarded_sample_seed = 
heap->guarded_sample_seed % heap->guarded_sample_rate; + } + heap->guarded_sample_count = heap->guarded_sample_seed; // count down samples } mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) { @@ -244,6 +226,7 @@ mi_heap_t* _mi_heap_main_get(void) { return &_mi_heap_main; } + /* ----------------------------------------------------------- Sub process ----------------------------------------------------------- */ @@ -317,6 +300,7 @@ static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE]; static mi_thread_data_t* mi_thread_data_zalloc(void) { // try to find thread metadata in the cache + bool is_zero = false; mi_thread_data_t* td = NULL; for (int i = 0; i < TD_CACHE_SIZE; i++) { td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); @@ -324,25 +308,32 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) { // found cached allocation, try use it td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); if (td != NULL) { - _mi_memzero(td, offsetof(mi_thread_data_t,memid)); - return td; + break; } } } // if that fails, allocate as meta data - mi_memid_t memid; - td = (mi_thread_data_t*)_mi_os_zalloc(sizeof(mi_thread_data_t), &memid); if (td == NULL) { - // if this fails, try once more. (issue #257) - td = (mi_thread_data_t*)_mi_os_zalloc(sizeof(mi_thread_data_t), &memid); + mi_memid_t memid; + td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid); if (td == NULL) { - // really out of memory - _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); - return NULL; + // if this fails, try once more. (issue #257) + td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid); + if (td == NULL) { + // really out of memory + _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); + } + } + if (td != NULL) { + td->memid = memid; + is_zero = memid.initially_zero; } } - td->memid = memid; + + if (td != NULL && !is_zero) { + _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid)); + } return td; } @@ -400,7 +391,7 @@ static bool _mi_thread_heap_init(void) { // initialize thread local data void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { - _mi_memcpy_aligned(tld, &tld_empty, sizeof(mi_tld_t)); + _mi_memzero_aligned(tld,sizeof(mi_tld_t)); tld->heap_backing = bheap; tld->heaps = NULL; tld->segments.subproc = &mi_subproc_default; @@ -441,10 +432,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // free if not the main thread if (heap != &_mi_heap_main) { - // the following assertion does not always hold for huge segments as those are always treated - // as abondened: one may allocate it in one thread, but deallocate in another in which case - // the count can be too large or negative. todo: perhaps not count huge segments? 
see issue #363 - // mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id()); + mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id()); mi_thread_data_free((mi_thread_data_t*)heap); } else { @@ -577,7 +565,7 @@ mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept { } // Called once by the process loader from `src/prim/prim.c` -void _mi_auto_process_init(void) { +void _mi_process_load(void) { mi_heap_main_init(); #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; @@ -659,13 +647,13 @@ void mi_process_init(void) mi_attr_noexcept { if (mi_option_is_enabled(mi_option_reserve_os_memory)) { long ksize = mi_option_get(mi_option_reserve_os_memory); if (ksize > 0) { - mi_reserve_os_memory((size_t)ksize*MI_KiB, true /* commit? */, true /* allow large pages? */); + mi_reserve_os_memory((size_t)ksize*MI_KiB, true, true); } } } -// Called when the process is done (cdecl as it is used with `at_exit` on some platforms) -void mi_cdecl mi_process_done(void) mi_attr_noexcept { +// Called when the process is done (through `at_exit`) +void mi_cdecl _mi_process_done(void) { // only shutdown if we were initialized if (!_mi_process_is_initialized) return; // ensure we are called once @@ -708,7 +696,3 @@ void mi_cdecl mi_process_done(void) mi_attr_noexcept { os_preloading = true; // don't call the C runtime anymore } -void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept { - if (_mi_option_get_fast(mi_option_destroy_on_exit)>1) return; - mi_process_done(); -} diff --git a/src/libc.c b/src/libc.c index 52d095eb..1bd97aa3 100644 --- a/src/libc.c +++ b/src/libc.c @@ -275,60 +275,3 @@ int _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) { va_end(args); return written; } - - -#if MI_SIZE_SIZE == 4 -#define mi_mask_even_bits32 (0x55555555) -#define mi_mask_even_pairs32 (0x33333333) -#define mi_mask_even_nibbles32 (0x0F0F0F0F) - -// sum of all the bytes in `x` if it is guaranteed that the sum < 256! -static size_t mi_byte_sum32(uint32_t x) { - // perform `x * 0x01010101`: the highest byte contains the sum of all bytes. - x += (x << 8); - x += (x << 16); - return (size_t)(x >> 24); -} - -static size_t mi_popcount_generic32(uint32_t x) { - // first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10 - // in other words, `a - (a>>1)`; to do this in parallel, we need to mask to prevent spilling a bit pair - // into the lower bit-pair: - x = x - ((x >> 1) & mi_mask_even_bits32); - // add the 2-bit pair results - x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32); - // add the 4-bit nibble results - x = (x + (x >> 4)) & mi_mask_even_nibbles32; - // each byte now has a count of its bits, we can sum them now: - return mi_byte_sum32(x); -} - -mi_decl_noinline size_t _mi_popcount_generic(size_t x) { - return mi_popcount_generic32(x); -} - -#else -#define mi_mask_even_bits64 (0x5555555555555555) -#define mi_mask_even_pairs64 (0x3333333333333333) -#define mi_mask_even_nibbles64 (0x0F0F0F0F0F0F0F0F) - -// sum of all the bytes in `x` if it is guaranteed that the sum < 256! 
-static size_t mi_byte_sum64(uint64_t x) { - x += (x << 8); - x += (x << 16); - x += (x << 32); - return (size_t)(x >> 56); -} - -static size_t mi_popcount_generic64(uint64_t x) { - x = x - ((x >> 1) & mi_mask_even_bits64); - x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64); - x = (x + (x >> 4)) & mi_mask_even_nibbles64; - return mi_byte_sum64(x); -} - -mi_decl_noinline size_t _mi_popcount_generic(size_t x) { - return mi_popcount_generic64(x); -} -#endif - diff --git a/src/options.c b/src/options.c index af2a0e70..772dfe66 100644 --- a/src/options.c +++ b/src/options.c @@ -106,11 +106,11 @@ typedef struct mi_option_desc_s { static mi_option_desc_t options[_mi_option_last] = { // stable options - #if MI_DEBUG || defined(MI_SHOW_ERRORS) +#if MI_DEBUG || defined(MI_SHOW_ERRORS) { 1, UNINIT, MI_OPTION(show_errors) }, - #else +#else { 0, UNINIT, MI_OPTION(show_errors) }, - #endif +#endif { 0, UNINIT, MI_OPTION(show_stats) }, { MI_DEFAULT_VERBOSE, UNINIT, MI_OPTION(verbose) }, @@ -129,7 +129,7 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(reserve_os_memory) }, // reserve N KiB OS memory in advance (use `option_get_size`) { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread { 0, UNINIT, MI_OPTION(deprecated_page_reset) }, // reset page memory on free - { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_purge,abandoned_page_reset) }, // reset free page memory when a thread terminates + { 0, UNINIT, MI_OPTION(abandoned_page_purge) }, // purge free page memory when a thread terminates { 0, UNINIT, MI_OPTION(deprecated_segment_reset) }, // reset segment memory on free (needs eager commit) #if defined(__NetBSD__) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed @@ -425,14 +425,14 @@ static mi_decl_noinline void mi_recurse_exit_prim(void) { } static bool mi_recurse_enter(void) { - #if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD) + #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) if (_mi_preloading()) return false; #endif return mi_recurse_enter_prim(); } static void mi_recurse_exit(void) { - #if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD) + #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) if (_mi_preloading()) return; #endif mi_recurse_exit_prim(); @@ -525,7 +525,7 @@ void _mi_warning_message(const char* fmt, ...) { #if MI_DEBUG -mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) mi_attr_noexcept { +void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) { _mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion); abort(); } diff --git a/src/os.c b/src/os.c index 9b1b4b46..12cc5da3 100644 --- a/src/os.c +++ b/src/os.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2025, Microsoft Research, Daan Leijen +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
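
The `_mi_popcount_generic` fallback dropped from src/libc.c above is the classic SWAR reduction: count bits per 2-bit pair, then per nibble, then add the per-byte counts with shifted adds so the total lands in the top byte (safe because the sum is at most 64). A minimal standalone sketch of the 64-bit variant, with illustrative names rather than the file's (assumes a C11 compiler):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    // SWAR popcount: 2-bit pair counts, then nibble counts, then byte counts,
    // then shifted adds that accumulate all byte counts into the top byte.
    static size_t popcount64_sketch(uint64_t x) {
      x = x - ((x >> 1) & 0x5555555555555555ULL);                            // per pair
      x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);  // per nibble
      x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                            // per byte
      x += (x << 8); x += (x << 16); x += (x << 32);                         // byte sum
      return (size_t)(x >> 56);
    }

    int main(void) {
      assert(popcount64_sketch(0) == 0);
      assert(popcount64_sketch(~(uint64_t)0) == 64);
      assert(popcount64_sketch(0x8000000000000001ULL) == 2);
      return 0;
    }

This generic path is presumably only the fallback for compilers without a popcount builtin; hardware builds would use the intrinsic directly.
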
@@ -91,6 +91,21 @@ void _mi_os_init(void) { bool _mi_os_decommit(void* addr, size_t size); bool _mi_os_commit(void* addr, size_t size, bool* is_zero); +static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { + mi_assert_internal(alignment != 0); + uintptr_t mask = alignment - 1; + if ((alignment & mask) == 0) { // power of two? + return (sz & ~mask); + } + else { + return ((sz / alignment) * alignment); + } +} + +static void* mi_align_down_ptr(void* p, size_t alignment) { + return (void*)_mi_align_down((uintptr_t)p, alignment); +} + /* ----------------------------------------------------------- aligned hinting @@ -152,8 +167,8 @@ static void mi_os_free_huge_os_pages(void* p, size_t size); static void mi_os_prim_free(void* addr, size_t size, size_t commit_size) { mi_assert_internal((size % _mi_os_page_size()) == 0); - if (addr == NULL) return; // || _mi_os_is_huge_reserved(addr) - int err = _mi_prim_free(addr, size); // allow size==0 (issue #1041) + if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) + int err = _mi_prim_free(addr, size); if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } @@ -166,16 +181,15 @@ static void mi_os_prim_free(void* addr, size_t size, size_t commit_size) { void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) { if (mi_memkind_is_os(memid.memkind)) { size_t csize = memid.mem.os.size; - if (csize==0) { csize = _mi_os_good_alloc_size(size); } - mi_assert_internal(csize >= size); + if (csize==0) { _mi_os_good_alloc_size(size); } size_t commit_size = (still_committed ? csize : 0); void* base = addr; // different base? (due to alignment) if (memid.mem.os.base != base) { - mi_assert(memid.mem.os.base <= addr); + mi_assert(memid.mem.os.base <= addr); base = memid.mem.os.base; const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base; - if (memid.mem.os.size==0) { + if (memid.mem.os.size==0) { csize += diff; } if (still_committed) { @@ -286,10 +300,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // explicitly commit only the aligned part if (commit) { - if (!_mi_os_commit(p, size, NULL)) { - mi_os_prim_free(*base, over_size, 0); - return NULL; - } + _mi_os_commit(p, size, NULL); } } else { // mmap can free inside an allocation @@ -327,11 +338,9 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid) { bool os_is_large = false; bool os_is_zero = false; void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero); - if (p == NULL) return NULL; - - *memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large); - mi_assert_internal(memid->mem.os.size >= size); - mi_assert_internal(memid->initially_committed); + if (p != NULL) { + *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); + } return p; } @@ -347,42 +356,15 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo bool os_is_zero = false; void* os_base = NULL; void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base ); - if (p == NULL) return NULL; - - *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large); - memid->mem.os.base = os_base; - memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned? 
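
The `_mi_align_down` helper added in the hunk above rounds a value down to an alignment boundary, taking a single mask operation when the alignment is a power of two and falling back to integer division otherwise. The same logic as a self-contained sketch (illustrative names, not the file's):

    #include <assert.h>
    #include <stdint.h>

    // Round `v` down to a multiple of `alignment`: mask when the alignment is a
    // power of two ((alignment & (alignment-1)) == 0), divide otherwise.
    static uintptr_t align_down_sketch(uintptr_t v, uintptr_t alignment) {
      const uintptr_t mask = alignment - 1;
      if ((alignment & mask) == 0) {   // power of two?
        return v & ~mask;
      }
      return (v / alignment) * alignment;
    }

    int main(void) {
      assert(align_down_sketch(4097, 4096) == 4096);  // mask path
      assert(align_down_sketch(4096, 4096) == 4096);
      assert(align_down_sketch(1000, 24) == 984);     // division path: 41*24
      return 0;
    }

In practice the callers pass page-size alignments, so the mask path is the hot one; the division branch just keeps the helper total for arbitrary alignments.
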
- - mi_assert_internal(memid->mem.os.size >= size); - mi_assert_internal(_mi_is_aligned(p,alignment)); - if (commit) { mi_assert_internal(memid->initially_committed); } - return p; -} - - -mi_decl_nodiscard static void* mi_os_ensure_zero(void* p, size_t size, mi_memid_t* memid) { - if (p==NULL || size==0) return p; - // ensure committed - if (!memid->initially_committed) { - bool is_zero = false; - if (!_mi_os_commit(p, size, &is_zero)) { - _mi_os_free(p, size, *memid); - return NULL; - } - memid->initially_committed = true; + if (p != NULL) { + *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); + memid->mem.os.base = os_base; + // memid->mem.os.alignment = alignment; + memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned } - // ensure zero'd - if (memid->initially_zero) return p; - _mi_memzero_aligned(p,size); - memid->initially_zero = true; return p; } -void* _mi_os_zalloc(size_t size, mi_memid_t* memid) { - void* p = _mi_os_alloc(size,memid); - return mi_os_ensure_zero(p, size, memid); -} - /* ----------------------------------------------------------- OS aligned allocation with an offset. This is used for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc @@ -528,17 +510,6 @@ bool _mi_os_reset(void* addr, size_t size) { } -void _mi_os_reuse( void* addr, size_t size ) { - // page align conservatively within the range - size_t csize = 0; - void* const start = mi_os_page_align_area_conservative(addr, size, &csize); - if (csize == 0) return; - const int err = _mi_prim_reuse(start, csize); - if (err != 0) { - _mi_warning_message("cannot reuse OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); - } -} - // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size) @@ -548,7 +519,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size) mi_os_stat_increase(purged, size); if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? - !_mi_preloading()) // don't decommit during preloading (unsafe) + !_mi_preloading()) // don't decommit during preloading (unsafe) { bool needs_recommit = true; mi_os_decommit_ex(p, size, &needs_recommit, stat_size); @@ -568,6 +539,7 @@ bool _mi_os_purge(void* p, size_t size) { return _mi_os_purge_ex(p, size, true, size); } + // Protect a region in memory to be not accessible. 
static bool mi_os_protectx(void* addr, size_t size, bool protect) { // page align conservatively within the range @@ -646,7 +618,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse if (psize != NULL) *psize = 0; if (pages_reserved != NULL) *pages_reserved = 0; size_t size = 0; - uint8_t* const start = mi_os_claim_huge_pages(pages, &size); + uint8_t* start = mi_os_claim_huge_pages(pages, &size); if (start == NULL) return NULL; // or 32-bit systems // Allocate one page at the time but try to place them contiguously @@ -702,7 +674,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; } if (page != 0) { mi_assert(start != NULL); - *memid = _mi_memid_create_os(start, size, true /* is committed */, all_zero, true /* is_large */); + *memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */); memid->memkind = MI_MEM_OS_HUGE; mi_assert(memid->is_pinned); #ifdef MI_TRACK_ASAN @@ -724,47 +696,34 @@ static void mi_os_free_huge_os_pages(void* p, size_t size) { } } - /* ---------------------------------------------------------------------------- Support NUMA aware allocation -----------------------------------------------------------------------------*/ -static _Atomic(size_t) mi_numa_node_count; // = 0 // cache the node count +_Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count -int _mi_os_numa_node_count(void) { - size_t count = mi_atomic_load_acquire(&mi_numa_node_count); - if mi_unlikely(count == 0) { +size_t _mi_os_numa_node_count_get(void) { + size_t count = mi_atomic_load_acquire(&_mi_numa_node_count); + if (count <= 0) { long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly? - if (ncount > 0 && ncount < INT_MAX) { + if (ncount > 0) { count = (size_t)ncount; } else { - const size_t n = _mi_prim_numa_node_count(); // or detect dynamically - if (n == 0 || n > INT_MAX) { count = 1; } - else { count = n; } + count = _mi_prim_numa_node_count(); // or detect dynamically + if (count == 0) count = 1; } - mi_atomic_store_release(&mi_numa_node_count, count); // save it + mi_atomic_store_release(&_mi_numa_node_count, count); // save it _mi_verbose_message("using %zd numa regions\n", count); } - mi_assert_internal(count > 0 && count <= INT_MAX); - return (int)count; + return count; } -static int mi_os_numa_node_get(void) { - int numa_count = _mi_os_numa_node_count(); +int _mi_os_numa_node_get(void) { + size_t numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 - const size_t n = _mi_prim_numa_node(); - int numa_node = (n < INT_MAX ? (int)n : 0); + size_t numa_node = _mi_prim_numa_node(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } - return numa_node; -} - -int _mi_os_numa_node(void) { - if mi_likely(mi_atomic_load_relaxed(&mi_numa_node_count) == 1) { - return 0; - } - else { - return mi_os_numa_node_get(); - } + return (int)numa_node; } diff --git a/src/page-queue.c b/src/page-queue.c index c719b626..3507505d 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -12,7 +12,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #ifndef MI_IN_PAGE_C #error "this file should be included from 'page.c'" // include to help an IDE -#include "mimalloc.h" +#include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #endif @@ -38,15 +38,15 @@ terms of the MIT license. A copy of the license can be found in the file static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) { - return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+sizeof(uintptr_t))); + return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+sizeof(uintptr_t))); } static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) { - return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+(2*sizeof(uintptr_t)))); + return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+(2*sizeof(uintptr_t)))); } static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { - return (pq->block_size > MI_MEDIUM_OBJ_SIZE_MAX); + return (pq->block_size > MI_LARGE_OBJ_SIZE_MAX); } /* ----------------------------------------------------------- @@ -58,7 +58,7 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { // We use `wsize` for the size in "machine word sizes", // i.e. byte size == `wsize*sizeof(void*)`. static inline size_t mi_bin(size_t size) { - size_t wsize = _mi_wsize_from_size(size); + size_t wsize = _mi_wsize_from_size(size); #if defined(MI_ALIGN4W) if mi_likely(wsize <= 4) { return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes @@ -72,7 +72,7 @@ static inline size_t mi_bin(size_t size) { return (wsize == 0 ? 1 : wsize); } #endif - else if mi_unlikely(wsize > MI_MEDIUM_OBJ_WSIZE_MAX) { + else if mi_unlikely(wsize > MI_LARGE_OBJ_WSIZE_MAX) { return MI_BIN_HUGE; } else { @@ -107,7 +107,7 @@ size_t _mi_bin_size(size_t bin) { // Good size for allocation size_t mi_good_size(size_t size) mi_attr_noexcept { - if (size <= MI_MEDIUM_OBJ_SIZE_MAX) { + if (size <= MI_LARGE_OBJ_SIZE_MAX) { return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } else { @@ -136,11 +136,7 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* } #endif -static inline bool mi_page_is_large_or_huge(const mi_page_t* page) { - return (mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_huge(page)); -} - -size_t _mi_page_bin(const mi_page_t* page) { +static size_t mi_page_bin(const mi_page_t* page) { const size_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? 
MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); mi_assert_internal(bin <= MI_BIN_FULL); return bin; @@ -148,10 +144,10 @@ size_t _mi_page_bin(const mi_page_t* page) { static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { mi_assert_internal(heap!=NULL); - const size_t bin = _mi_page_bin(page); + const size_t bin = mi_page_bin(page); mi_page_queue_t* pq = &heap->pages[bin]; mi_assert_internal((mi_page_block_size(page) == pq->block_size) || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(pq)) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(pq)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(pq))); return pq; } @@ -214,11 +210,10 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); - mi_assert_internal(mi_page_block_size(page) == queue->block_size || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) || + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); - if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == queue->last) queue->last = page->prev; @@ -240,10 +235,10 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(!mi_page_queue_contains(queue, page)); #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); + mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif mi_assert_internal(mi_page_block_size(page) == queue->block_size || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); @@ -282,8 +277,8 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* mi_assert_internal((bsize == to->block_size && bsize == from->block_size) || (bsize == to->block_size && mi_page_queue_is_full(from)) || (bsize == from->block_size && mi_page_queue_is_full(to)) || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(to)) || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_full(to))); + (mi_page_is_huge(page) && mi_page_queue_is_huge(to)) || + (mi_page_is_huge(page) && mi_page_queue_is_full(to))); mi_heap_t* heap = mi_page_heap(page); @@ -322,8 +317,8 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* page->prev = to->first; page->next = next; to->first->next = page; - if (next != NULL) { - next->prev = page; + if (next != NULL) { + next->prev = page; } else { to->last = page; diff --git a/src/page.c b/src/page.c index a5a10503..6a693e89 100644 --- a/src/page.c +++ b/src/page.c @@ -37,7 +37,7 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta } static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld); -static bool mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld); +static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld); #if (MI_DEBUG>=3) static size_t mi_page_list_count(mi_page_t* page, 
mi_block_t* head) { @@ -82,9 +82,11 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); + // const size_t bsize = mi_page_block_size(page); + mi_segment_t* segment = _mi_page_segment(page); uint8_t* start = mi_page_start(page); - mi_assert_internal(start == _mi_segment_page_start(_mi_page_segment(page), page, NULL)); - mi_assert_internal(page->is_huge == (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE)); + mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL)); + mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -112,7 +114,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { return true; } -extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? +extern bool _mi_process_is_initialized; // has mi_process_init been called? bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(mi_page_is_valid_init(page)); @@ -121,15 +123,14 @@ bool _mi_page_is_valid(mi_page_t* page) { #endif if (mi_page_heap(page)!=NULL) { mi_segment_t* segment = _mi_page_segment(page); - - mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id); + mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == mi_page_heap(page)->thread_id || segment->thread_id==0); #if MI_HUGE_PAGE_ABANDON - if (segment->kind != MI_SEGMENT_HUGE) + if (segment->page_kind != MI_PAGE_HUGE) #endif { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); - mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page)); + mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page)); mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); } } @@ -256,11 +257,10 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { // called from segments when reclaiming abandoned pages void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_expensive(mi_page_is_valid_init(page)); - mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); + mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif // TODO: push on full queue immediately if it is full? 
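
Returning to the `mi_bin` mapping in src/page-queue.c above: the body of its final `else` branch falls outside the quoted hunks, but the page-queue table in src/init.c (8, 10, 12, 14, 16, 20, 24, 28, 32, ...) shows the intended shape — four size classes per power of two, which caps worst-case internal fragmentation at roughly 25%. A reconstruction of that style of mapping, offered as an assumption rather than a quote of the file's exact code (`wsize` is the size in machine words, here taken to be at least 9):

    #include <assert.h>
    #include <stddef.h>

    static size_t bsr_sketch(size_t x) {   // index of the highest set bit
      size_t b = 0;
      while (x >>= 1) b++;
      return b;
    }

    // Pseudo-logarithmic bins: the top bit picks the power of two and the
    // next two bits pick one of four subdivisions within it.
    static size_t bin_sketch(size_t wsize) {
      wsize--;
      const size_t b = bsr_sketch(wsize);
      return ((b << 2) + ((wsize >> (b - 2)) & 3)) - 3;
    }

    int main(void) {
      assert(bin_sketch(9) == bin_sketch(10));   // 9..10 share the "10" bin
      assert(bin_sketch(11) == bin_sketch(12));  // 11..12 share the "12" bin
      assert(bin_sketch(10) < bin_sketch(11));
      assert(bin_sketch(16) < bin_sketch(17));   // new power of two, new bin
      return 0;
    }
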
@@ -274,7 +274,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size #if !MI_HUGE_PAGE_ABANDON mi_assert_internal(pq != NULL); mi_assert_internal(mi_heap_contains_queue(heap, pq)); - mi_assert_internal(page_alignment > 0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || block_size == pq->block_size); + mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_OBJ_SIZE_MAX || block_size == pq->block_size); #endif mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments); if (page == NULL) { @@ -284,14 +284,13 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size #if MI_HUGE_PAGE_ABANDON mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif - mi_assert_internal(page_alignment >0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE); mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); // a fresh page was found, initialize it const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc mi_assert_internal(full_block_size >= block_size); mi_page_init(heap, page, full_block_size, heap->tld); mi_heap_stat_increase(heap, pages, 1); - mi_heap_stat_increase(heap, page_bins[_mi_page_bin(page)], 1); + mi_heap_stat_increase(heap, page_bins[mi_page_bin(page)], 1); if (pq != NULL) { mi_page_queue_push(heap, pq, page); } mi_assert_expensive(_mi_page_is_valid(page)); return page; @@ -427,7 +426,6 @@ void _mi_page_force_abandon(mi_page_t* page) { } } - // Free a page with no more free blocks void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { mi_assert_internal(page != NULL); @@ -445,12 +443,13 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { mi_segments_tld_t* segments_tld = &heap->tld->segments; mi_page_queue_remove(pq, page); - // and free it + // and free it + mi_heap_stat_decrease(heap, page_bins[mi_page_bin(page)], 1); mi_page_set_heap(page,NULL); _mi_segment_page_free(page, force, segments_tld); } -#define MI_MAX_RETIRE_SIZE MI_MEDIUM_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE +#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE #define MI_RETIRE_CYCLES (16) // Retire a page with no more used blocks @@ -624,7 +623,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co #if (MI_SECURE>0) #define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many #else -#define MI_MIN_EXTEND (4) +#define MI_MIN_EXTEND (1) #endif // Extend the capacity (up to reserved) by initializing a free list @@ -632,15 +631,18 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co // Note: we also experimented with "bump" allocation on the first // allocations but this did not speed up any benchmark (due to an // extra test in malloc? or cache effects?) 
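
The comment above describes `mi_page_extend_free`: rather than building the whole free list up front, a page's capacity grows lazily by threading a batch of fresh blocks onto its free list. Stripped of the real `mi_page_free_list_extend`'s padding, secure-mode randomization, and capacity bookkeeping, the linking step is essentially this (simplified types; a sketch, not the file's code):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef struct block_s { struct block_s* next; } block_t;

    // Thread `extend` fresh blocks of `bsize` bytes starting at `start` into a
    // singly-linked free list; the last block points at the existing list.
    static block_t* free_list_extend_sketch(uint8_t* start, size_t bsize,
                                            size_t extend, block_t* old_free) {
      block_t* const head = (block_t*)start;
      block_t* b = head;
      for (size_t i = 1; i < extend; i++) {
        block_t* next = (block_t*)((uint8_t*)b + bsize);
        b->next = next;
        b = next;
      }
      b->next = old_free;   // usually NULL when extending
      return head;
    }

    int main(void) {
      _Alignas(block_t*) uint8_t area[4 * 32];  // pretend page area: 4 blocks of 32 bytes
      block_t* head = free_list_extend_sketch(area, 32, 4, NULL);
      size_t n = 0;
      for (block_t* b = head; b != NULL; b = b->next) n++;
      assert(n == 4);
      return 0;
    }
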
-static bool mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) { +static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) { mi_assert_expensive(mi_page_is_valid_init(page)); #if (MI_SECURE<=2) mi_assert(page->free == NULL); mi_assert(page->local_free == NULL); - if (page->free != NULL) return true; + if (page->free != NULL) return; #endif - if (page->capacity >= page->reserved) return true; + if (page->capacity >= page->reserved) return; + size_t page_size; + //uint8_t* page_start = + _mi_segment_page_start(_mi_page_segment(page), page, &page_size); mi_stat_counter_increase(tld->stats.pages_extended, 1); // calculate the extend count @@ -672,7 +674,6 @@ static bool mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) page->capacity += (uint16_t)extend; mi_stat_increase(tld->stats.page_committed, extend * bsize); mi_assert_expensive(mi_page_is_valid_init(page)); - return true; } // Initialize a fresh page @@ -687,8 +688,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi size_t page_size; page->page_start = _mi_segment_page_start(segment, page, &page_size); mi_track_mem_noaccess(page->page_start,page_size); - mi_assert_internal(mi_page_block_size(page) <= page_size); - mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); mi_assert_internal(page->reserved > 0); @@ -703,7 +702,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size)); } #endif - mi_assert_internal(page->is_committed); if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); } @@ -727,10 +725,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list - if (mi_page_extend_free(heap,page,tld)) { - mi_assert(mi_page_immediate_available(page)); - } - return; + mi_page_extend_free(heap,page,tld); + mi_assert(mi_page_immediate_available(page)); } @@ -822,18 +818,13 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p if (page_candidate != NULL) { page = page_candidate; } - if (page != NULL) { - if (!mi_page_immediate_available(page)) { - mi_assert_internal(mi_page_is_expandable(page)); - if (!mi_page_extend_free(heap, page, heap->tld)) { - page = NULL; // failed to extend - } - } - mi_assert_internal(page == NULL || mi_page_immediate_available(page)); + if (page != NULL && !mi_page_immediate_available(page)) { + mi_assert_internal(mi_page_is_expandable(page)); + mi_page_extend_free(heap, page, heap->tld); } if (page == NULL) { - _mi_heap_collect_retired(heap, false); // perhaps make a page available? + _mi_heap_collect_retired(heap, false); // perhaps make a page available page = mi_page_fresh(heap, pq); if (page == NULL && first_try) { // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again @@ -911,47 +902,31 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex General allocation ----------------------------------------------------------- */ -// Large and huge page allocation. -// Huge pages contain just one block, and the segment contains just that page (as `MI_SEGMENT_HUGE`). 
+// Huge pages contain just one block, and the segment contains just that page. // Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) // so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. -static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { +static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { size_t block_size = _mi_os_good_alloc_size(size); mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); - bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX || page_alignment > 0); #if MI_HUGE_PAGE_ABANDON - mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size)); + mi_page_queue_t* pq = NULL; #else - mi_page_queue_t* pq = mi_page_queue(heap, is_huge ? MI_LARGE_OBJ_SIZE_MAX+1 : block_size); - mi_assert_internal(!is_huge || mi_page_queue_is_huge(pq)); + mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size + mi_assert_internal(mi_page_queue_is_huge(pq)); #endif mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); if (page != NULL) { + mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_immediate_available(page)); - - if (is_huge) { - mi_assert_internal(mi_page_is_huge(page)); - mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE); - mi_assert_internal(_mi_page_segment(page)->used==1); - #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue - mi_page_set_heap(page, NULL); - #endif - } - else { - mi_assert_internal(!mi_page_is_huge(page)); - } - - const size_t bsize = mi_page_usable_block_size(page); // note: not `mi_page_block_size` to account for padding - /*if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_increase(heap, malloc_large, bsize); - mi_heap_stat_counter_increase(heap, malloc_large_count, 1); - } - else */ - { - _mi_stat_increase(&heap->tld->stats.malloc_huge, bsize); - _mi_stat_counter_increase(&heap->tld->stats.malloc_huge_count, 1); - } + mi_assert_internal(mi_page_is_huge(page)); + mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE); + mi_assert_internal(_mi_page_segment(page)->used==1); + #if MI_HUGE_PAGE_ABANDON + mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue + mi_page_set_heap(page, NULL); + #endif + mi_heap_stat_increase(heap, malloc_huge, mi_page_block_size(page)); + mi_heap_stat_counter_increase(heap, malloc_huge_count, 1); } return page; } @@ -962,13 +937,13 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { // huge allocation? 
const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { + if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; } else { - return mi_large_huge_page_alloc(heap,size,huge_alignment); + return mi_huge_page_alloc(heap,size,huge_alignment); } } else { @@ -1004,9 +979,9 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // free delayed frees from other threads (but skip contended ones) _mi_heap_delayed_free_partial(heap); - + // collect every once in a while (10000 by default) - const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L); + const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L); if (heap->generic_collect_count >= generic_collect) { heap->generic_collect_count = 0; mi_heap_collect(heap, false /* force? */); diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index c4cfc35d..82147de7 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2025, Microsoft Research, Daan Leijen, Alon Zakai +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen, Alon Zakai This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -58,7 +58,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config) { extern void emmalloc_free(void*); int _mi_prim_free(void* addr, size_t size) { - if (size==0) return 0; + MI_UNUSED(size); emmalloc_free(addr); return 0; } @@ -114,11 +114,6 @@ int _mi_prim_reset(void* addr, size_t size) { return 0; } -int _mi_prim_reuse(void* addr, size_t size) { - MI_UNUSED(addr); MI_UNUSED(size); - return 0; -} - int _mi_prim_protect(void* addr, size_t size, bool protect) { MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect); return 0; diff --git a/src/prim/prim.c b/src/prim/prim.c index 5147bae8..2002853f 100644 --- a/src/prim/prim.c +++ b/src/prim/prim.c @@ -39,29 +39,29 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_attr_destructor __attribute__((destructor)) #endif static void mi_attr_constructor mi_process_attach(void) { - _mi_auto_process_init(); + _mi_process_load(); } static void mi_attr_destructor mi_process_detach(void) { - _mi_auto_process_done(); + _mi_process_done(); } #elif defined(__cplusplus) // C++: use static initialization to detect process start/end // This is not guaranteed to be first/last but the best we can generally do? 
struct mi_init_done_t { mi_init_done_t() { - _mi_auto_process_init(); + _mi_process_load(); } ~mi_init_done_t() { - _mi_auto_process_done(); + _mi_process_done(); } }; static mi_init_done_t mi_init_done; #else - #pragma message("define a way to call _mi_auto_process_init/done on your platform") + #pragma message("define a way to call _mi_process_load/done on your platform") #endif #endif -// Generic allocator init/done callback +// Generic allocator init/done callback #ifndef MI_PRIM_HAS_ALLOCATOR_INIT bool _mi_is_redirected(void) { return false; diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 650aa657..8e3180e6 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2025, Microsoft Research, Daan Leijen +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -31,12 +31,11 @@ terms of the MIT license. A copy of the license can be found in the file #if defined(__linux__) #include - #include // THP disable, PR_SET_VMA - #if defined(__GLIBC__) && !defined(PR_SET_VMA) - #include - #endif + //#if defined(MI_NO_THP) + #include // THP disable + //#endif #if defined(__GLIBC__) - #include // linux mmap flags + #include // linux mmap flags #else #include #endif @@ -70,8 +69,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MADV_FREE POSIX_MADV_FREE #endif -#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this? - + //------------------------------------------------------------------------------------ // Use syscalls for some primitives to allow for libraries that override open/read/close etc. // and do allocation themselves; using syscalls prevents recursion when mimalloc is @@ -157,7 +155,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) } #endif } - config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE; + config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? config->has_overcommit = unix_detect_overcommit(); config->has_partial_free = true; // mmap can free in parts config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) @@ -187,7 +185,6 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) //--------------------------------------------- int _mi_prim_free(void* addr, size_t size ) { - if (size==0) return 0; bool err = (munmap(addr, size) == -1); return (err ? errno : 0); } @@ -208,24 +205,14 @@ static int unix_madvise(void* addr, size_t size, int advice) { return (res==0 ? 
0 : errno); } -static void* unix_mmap_prim(void* addr, size_t size, int protect_flags, int flags, int fd) { - void* p = mmap(addr, size, protect_flags, flags, fd, 0 /* offset */); - #if defined(__linux__) && defined(PR_SET_VMA) - if (p!=MAP_FAILED && p!=NULL) { - prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p, size, "mimalloc"); - } - #endif - return p; -} - -static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { +static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { MI_UNUSED(try_alignment); void* p = NULL; #if defined(MAP_ALIGNED) // BSD if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { size_t n = mi_bsr(try_alignment); if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB - p = unix_mmap_prim(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd); + p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0); if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { int err = errno; _mi_trace_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr); @@ -236,7 +223,7 @@ static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignmen } #elif defined(MAP_ALIGN) // Solaris if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { - p = unix_mmap_prim((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd); // addr parameter is the required alignment + p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment if (p!=MAP_FAILED) return p; // fall back to regular mmap } @@ -246,7 +233,7 @@ static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignmen if (addr == NULL) { void* hint = _mi_os_get_aligned_hint(try_alignment, size); if (hint != NULL) { - p = unix_mmap_prim(hint, size, protect_flags, flags, fd); + p = mmap(hint, size, protect_flags, flags, fd, 0); if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { #if MI_TRACK_ENABLED // asan sometimes does not instrument errno correctly? 
int err = 0; @@ -261,7 +248,7 @@ static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignmen } #endif // regular mmap - p = unix_mmap_prim(addr, size, protect_flags, flags, fd); + p = mmap(addr, size, protect_flags, flags, fd, 0); if (p!=MAP_FAILED) return p; // failed to allocate return NULL; @@ -332,7 +319,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec if (large_only || lflags != flags) { // try large OS page allocation *is_large = true; - p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, lflags, lfd); + p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); #ifdef MAP_HUGE_1GB if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) { mi_huge_pages_available = false; // don't try huge 1GiB pages again @@ -340,7 +327,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno); } lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); - p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, lflags, lfd); + p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); } #endif if (large_only) return p; @@ -353,7 +340,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec // regular allocation if (p == NULL) { *is_large = false; - p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, flags, fd); + p = unix_mmap_prim(addr, size, try_alignment, protect_flags, flags, fd); if (p != NULL) { #if defined(MADV_HUGEPAGE) // Many Linux systems don't allow MAP_HUGETLB but they support instead @@ -387,9 +374,6 @@ int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool comm mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); - if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) { - try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations - } *is_zero = true; int protect_flags = (commit ? 
(PROT_WRITE | PROT_READ) : PROT_NONE); @@ -414,6 +398,10 @@ static void unix_mprotect_hint(int err) { #endif } + + + + int _mi_prim_commit(void* start, size_t size, bool* is_zero) { // commit: ensure we can access the area // note: we may think that *is_zero can be true since the memory @@ -429,25 +417,11 @@ int _mi_prim_commit(void* start, size_t size, bool* is_zero) { return err; } -int _mi_prim_reuse(void* start, size_t size) { - MI_UNUSED(start); MI_UNUSED(size); - #if defined(__APPLE__) && defined(MADV_FREE_REUSE) - return unix_madvise(start, size, MADV_FREE_REUSE); - #endif - return 0; -} - int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { int err = 0; - #if defined(__APPLE__) && defined(MADV_FREE_REUSABLE) - // decommit on macOS: use MADV_FREE_REUSABLE as it does immediate rss accounting (issue #1097) - err = unix_madvise(start, size, MADV_FREE_REUSABLE); - if (err) { err = unix_madvise(start, size, MADV_DONTNEED); } - #else - // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) - err = unix_madvise(start, size, MADV_DONTNEED); - #endif - #if !MI_DEBUG && MI_SECURE<=2 + // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) + err = unix_madvise(start, size, MADV_DONTNEED); + #if !MI_DEBUG && !MI_SECURE *needs_recommit = false; #else *needs_recommit = true; @@ -464,22 +438,14 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { } int _mi_prim_reset(void* start, size_t size) { - int err = 0; - - // on macOS can use MADV_FREE_REUSABLE (but we disable this for now as it seems slower) - #if 0 && defined(__APPLE__) && defined(MADV_FREE_REUSABLE) - err = unix_madvise(start, size, MADV_FREE_REUSABLE); - if (err==0) return 0; - // fall through - #endif - - #if defined(MADV_FREE) - // Otherwise, we try to use `MADV_FREE` as that is the fastest. A drawback though is that it + // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it // will not reduce the `rss` stats in tools like `top` even though the memory is available // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by // default `MADV_DONTNEED` is used though. + #if defined(MADV_FREE) static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); int oadvice = (int)mi_atomic_load_relaxed(&advice); + int err; while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on @@ -487,7 +453,7 @@ int _mi_prim_reset(void* start, size_t size) { err = unix_madvise(start, size, MADV_DONTNEED); } #else - err = unix_madvise(start, size, MADV_DONTNEED); + int err = unix_madvise(start, size, MADV_DONTNEED); #endif return err; } diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c index 745a41fd..e1e7de5e 100644 --- a/src/prim/wasi/prim.c +++ b/src/prim/wasi/prim.c @@ -149,11 +149,6 @@ int _mi_prim_reset(void* addr, size_t size) { return 0; } -int _mi_prim_reuse(void* addr, size_t size) { - MI_UNUSED(addr); MI_UNUSED(size); - return 0; -} - int _mi_prim_protect(void* addr, size_t size, bool protect) { MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect); return 0; diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index eebdc4a6..a080f4bc 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -12,10 +12,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/prim.h" #include // fputs, stderr -// xbox has no console IO -#if !defined(WINAPI_FAMILY_PARTITION) || WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM) -#define MI_HAS_CONSOLE_IO -#endif //--------------------------------------------- // Dynamically bind Windows API points for portability @@ -49,30 +45,22 @@ typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S { #define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE 0x00000010 #include -typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); -typedef LONG (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); // avoid NTSTATUS as it is not defined on xbox (pr #1084) +typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); +typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); static PVirtualAlloc2 pVirtualAlloc2 = NULL; static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; -// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7 (and GetNumaNodeProcessorMask is not supported on xbox) +// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7 typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER; typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber); typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber); typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask); typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber); -typedef BOOL (__stdcall* PGetNumaNodeProcessorMask)(UCHAR Node, PULONGLONG ProcessorMask); -typedef BOOL (__stdcall* PGetNumaHighestNodeNumber)(PULONG Node); static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL; static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL; static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL; static PGetNumaProcessorNode pGetNumaProcessorNode = NULL; -static PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = NULL; -static PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = NULL; - -// Not available on xbox -typedef SIZE_T(__stdcall* PGetLargePageMinimum)(VOID); -static PGetLargePageMinimum pGetLargePageMinimum = NULL; // Available after Windows XP typedef BOOL (__stdcall *PGetPhysicallyInstalledSystemMemory)( PULONGLONG TotalMemoryInKilobytes ); @@ -86,7 +74,6 @@ static bool win_enable_large_os_pages(size_t* large_page_size) static bool large_initialized = false; if (large_initialized) return (_mi_os_large_page_size() > 0); large_initialized = true; - if (pGetLargePageMinimum==NULL) return false; // no large page support (xbox etc.) 
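
On the `-` side above, `GetLargePageMinimum` (like `VirtualAlloc2` and the NUMA entry points) is resolved at runtime through `GetProcAddress`, so one binary still loads where an export is missing — the xbox case called out in the removed comments; the `+` side goes back to calling it directly. The binding pattern in isolation, as a hedged standalone sketch:

    #include <windows.h>
    #include <stdio.h>

    // Resolve an optional kernel32 export at runtime and fall back gracefully.
    typedef SIZE_T (__stdcall *PGetLargePageMinimum_t)(VOID);

    int main(void) {
      HMODULE hDll = GetModuleHandleA("kernel32.dll");
      PGetLargePageMinimum_t pGetLargePageMinimum = (hDll == NULL ? NULL :
        (PGetLargePageMinimum_t)(void (*)(void))GetProcAddress(hDll, "GetLargePageMinimum"));
      if (pGetLargePageMinimum != NULL) {
        printf("large page minimum: %zu bytes\n", (size_t)pGetLargePageMinimum());
      }
      else {
        printf("GetLargePageMinimum not available; assuming no large page support\n");
      }
      return 0;
    }

The double cast through `void (*)(void)` mirrors the file's own idiom for suppressing function-pointer conversion warnings on `GetProcAddress`'s `FARPROC` result.
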
// Try to see if large OS pages are supported // To use large pages on Windows, we first need access permission @@ -105,8 +92,8 @@ static bool win_enable_large_os_pages(size_t* large_page_size) if (ok) { err = GetLastError(); ok = (err == ERROR_SUCCESS); - if (ok && large_page_size != NULL && pGetLargePageMinimum != NULL) { - *large_page_size = (*pGetLargePageMinimum)(); + if (ok && large_page_size != NULL) { + *large_page_size = GetLargePageMinimum(); } } } @@ -162,9 +149,6 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx"); pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx"); pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode"); - pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMask"); - pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)(void (*)(void))GetProcAddress(hDll, "GetNumaHighestNodeNumber"); - pGetLargePageMinimum = (PGetLargePageMinimum)(void (*)(void))GetProcAddress(hDll, "GetLargePageMinimum"); // Get physical memory (not available on XP, so check dynamically) PGetPhysicallyInstalledSystemMemory pGetPhysicallyInstalledSystemMemory = (PGetPhysicallyInstalledSystemMemory)(void (*)(void))GetProcAddress(hDll,"GetPhysicallyInstalledSystemMemory"); if (pGetPhysicallyInstalledSystemMemory != NULL) { @@ -368,11 +352,6 @@ int _mi_prim_reset(void* addr, size_t size) { return (p != NULL ? 0 : (int)GetLastError()); } -int _mi_prim_reuse(void* addr, size_t size) { - MI_UNUSED(addr); MI_UNUSED(size); - return 0; -} - int _mi_prim_protect(void* addr, size_t size, bool protect) { DWORD oldprotect = 0; BOOL ok = VirtualProtect(addr, size, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect); @@ -404,7 +383,7 @@ static void* _mi_prim_alloc_huge_os_pagesx(void* hint_addr, size_t size, int num } SIZE_T psize = size; void* base = hint_addr; - LONG err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); + NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); if (err == 0 && base != NULL) { return base; } @@ -458,11 +437,9 @@ size_t _mi_prim_numa_node(void) { size_t _mi_prim_numa_node_count(void) { ULONG numa_max = 0; - if (pGetNumaHighestNodeNumber!=NULL) { - (*pGetNumaHighestNodeNumber)(&numa_max); - } + GetNumaHighestNodeNumber(&numa_max); // find the highest node number that has actual processors assigned to it. Issue #282 - while (numa_max > 0) { + while(numa_max > 0) { if (pGetNumaNodeProcessorMaskEx != NULL) { // Extended API is supported GROUP_AFFINITY affinity; @@ -473,10 +450,8 @@ size_t _mi_prim_numa_node_count(void) { else { // Vista or earlier, use older API that is limited to 64 processors. 
ULONGLONG mask; - if (pGetNumaNodeProcessorMask != NULL) { - if ((*pGetNumaNodeProcessorMask)((UCHAR)numa_max, &mask)) { - if (mask != 0) break; // found the maximum non-empty node - } + if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) { + if (mask != 0) break; // found the maximum non-empty node }; } // max node was invalid or had no processor assigned, try again @@ -566,21 +541,17 @@ void _mi_prim_out_stderr( const char* msg ) if (!_mi_preloading()) { // _cputs(msg); // _cputs cannot be used as it aborts when failing to lock the console static HANDLE hcon = INVALID_HANDLE_VALUE; - static bool hconIsConsole = false; + static bool hconIsConsole; if (hcon == INVALID_HANDLE_VALUE) { - hcon = GetStdHandle(STD_ERROR_HANDLE); - #ifdef MI_HAS_CONSOLE_IO CONSOLE_SCREEN_BUFFER_INFO sbi; + hcon = GetStdHandle(STD_ERROR_HANDLE); hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi)); - #endif } const size_t len = _mi_strlen(msg); if (len > 0 && len < UINT32_MAX) { DWORD written = 0; if (hconIsConsole) { - #ifdef MI_HAS_CONSOLE_IO WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL); - #endif } else if (hcon != INVALID_HANDLE_VALUE) { // use direct write if stderr was redirected @@ -656,47 +627,19 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { // Process & Thread Init/Done //---------------------------------------------------------------- -#if MI_WIN_USE_FIXED_TLS==1 -mi_decl_cache_align size_t _mi_win_tls_offset = 0; -#endif - -//static void mi_debug_out(const char* s) { -// HANDLE h = GetStdHandle(STD_ERROR_HANDLE); -// WriteConsole(h, s, (DWORD)_mi_strlen(s), NULL, NULL); -//} - -static void mi_win_tls_init(DWORD reason) { - if (reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) { - #if MI_WIN_USE_FIXED_TLS==1 // we must allocate a TLS slot dynamically - if (_mi_win_tls_offset == 0 && reason == DLL_PROCESS_ATTACH) { - const DWORD tls_slot = TlsAlloc(); // usually returns slot 1 - if (tls_slot == TLS_OUT_OF_INDEXES) { - _mi_error_message(EFAULT, "unable to allocate the a TLS slot (rebuild without MI_WIN_USE_FIXED_TLS?)\n"); - } - _mi_win_tls_offset = (size_t)tls_slot * sizeof(void*); - } - #endif - #if MI_HAS_TLS_SLOT >= 2 // we must initialize the TLS slot before any allocation - if (mi_prim_get_default_heap() == NULL) { - _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty); - #if MI_DEBUG && MI_WIN_USE_FIXED_TLS==1 - void* const p = TlsGetValue((DWORD)(_mi_win_tls_offset / sizeof(void*))); - mi_assert_internal(p == (void*)&_mi_heap_empty); - #endif - } - #endif - } -} - static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { MI_UNUSED(reserved); MI_UNUSED(module); - mi_win_tls_init(reason); + #if MI_TLS_SLOT >= 2 + if ((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) { + _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty); + } + #endif if (reason==DLL_PROCESS_ATTACH) { - _mi_auto_process_init(); + _mi_process_load(); } else if (reason==DLL_PROCESS_DETACH) { - _mi_auto_process_done(); + _mi_process_done(); } else if (reason==DLL_THREAD_DETACH && !_mi_is_redirected()) { _mi_thread_done(NULL); @@ -786,7 +729,7 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { static int mi_process_attach(void) { mi_win_main(NULL,DLL_PROCESS_ATTACH,NULL); - atexit(&_mi_auto_process_done); + atexit(&_mi_process_done); return 0; } typedef int(*mi_crt_callback_t)(void); @@ -853,7 +796,11 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID 
diff --git a/src/random.c b/src/random.c
index f17698ba..4fc8b2f8 100644
--- a/src/random.c
+++ b/src/random.c
@@ -143,17 +143,13 @@ void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* ctx_new) {

 uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
   mi_assert_internal(mi_random_is_initialized(ctx));
-  uintptr_t r;
-  do {
-    #if MI_INTPTR_SIZE <= 4
-    r = chacha_next32(ctx);
-    #elif MI_INTPTR_SIZE == 8
-    r = (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx));
-    #else
-    # error "define mi_random_next for this platform"
-    #endif
-  } while (r==0);
-  return r;
+  #if MI_INTPTR_SIZE <= 4
+  return chacha_next32(ctx);
+  #elif MI_INTPTR_SIZE == 8
+  return (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx));
+  #else
+  # error "define mi_random_next for this platform"
+  #endif
 }

@@ -167,7 +163,7 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
   x ^= _mi_prim_clock_now();
   // and do a few randomization steps
   uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
-  for (uintptr_t i = 0; i < max || x==0; i++, x++) {
+  for (uintptr_t i = 0; i < max; i++) {
     x = _mi_random_shuffle(x);
   }
   mi_assert_internal(x != 0);
@@ -183,7 +179,7 @@ static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) {
   if (!use_weak) { _mi_warning_message("unable to use secure randomness\n"); }
   #endif
   uintptr_t x = _mi_os_random_weak(0);
-  for (size_t i = 0; i < 8; i++, x++) {  // key is eight 32-bit words.
+  for (size_t i = 0; i < 8; i++) {  // key is eight 32-bit words.
     x = _mi_random_shuffle(x);
     ((uint32_t*)key)[i] = (uint32_t)x;
   }
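Note the behavioral change in _mi_random_next: the retry loop is gone, so a zero result can now be returned. The sketch below is illustrative only (next32 is a hypothetical stand-in for chacha_next32); it shows how the 64-bit branch composes one word from two 32-bit draws, sequenced explicitly to avoid unspecified evaluation order:

#include <stdint.h>
#include <stdio.h>

// hypothetical stand-in for chacha_next32: any 32-bit generator works here
static uint32_t next32(void) {
  static uint32_t s = 0x9E3779B9u;
  s ^= s << 13; s ^= s >> 17; s ^= s << 5;  // xorshift32
  return s;
}

// the MI_INTPTR_SIZE==8 branch above: two draws make one 64-bit word;
// a zero result is returned as-is (the old code looped until r != 0)
static uint64_t random_next64(void) {
  uint64_t hi = next32();   // high word drawn first
  uint32_t lo = next32();   // then the low word
  return (hi << 32) | lo;
}

int main(void) {
  printf("%016llx\n", (unsigned long long)random_next64());
  return 0;
}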
diff --git a/src/segment-map.c b/src/segment-map.c
index bbcea28a..2f68f8c4 100644
--- a/src/segment-map.c
+++ b/src/segment-map.c
@@ -61,7 +61,7 @@ static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) {
   if mi_unlikely(part == NULL) {
     if (!create_on_demand) return NULL;
     mi_memid_t memid;
-    part = (mi_segmap_part_t*)_mi_os_zalloc(sizeof(mi_segmap_part_t), &memid);
+    part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid);
     if (part == NULL) return NULL;
     part->memid = memid;
     mi_segmap_part_t* expected = NULL;
diff --git a/src/segment.c b/src/segment.c
index 32841e6d..e2730b7f 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -11,153 +11,20 @@ terms of the MIT license. A copy of the license can be found in the file
 #include <string.h>  // memset
 #include <stdio.h>

-// -------------------------------------------------------------------
-// Segments
-// mimalloc pages reside in segments. See `mi_segment_valid` for invariants.
-// -------------------------------------------------------------------
-
-
-static void mi_segment_try_purge(mi_segment_t* segment, bool force);
-
-
-// -------------------------------------------------------------------
-// commit mask
-// -------------------------------------------------------------------
-
-static bool mi_commit_mask_all_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    if ((commit->mask[i] & cm->mask[i]) != cm->mask[i]) return false;
-  }
-  return true;
-}
-
-static bool mi_commit_mask_any_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    if ((commit->mask[i] & cm->mask[i]) != 0) return true;
-  }
-  return false;
-}
-
-static void mi_commit_mask_create_intersect(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm, mi_commit_mask_t* res) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    res->mask[i] = (commit->mask[i] & cm->mask[i]);
-  }
-}
-
-static void mi_commit_mask_clear(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    res->mask[i] &= ~(cm->mask[i]);
-  }
-}
-
-static void mi_commit_mask_set(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    res->mask[i] |= cm->mask[i];
-  }
-}
-
-static void mi_commit_mask_create(size_t bitidx, size_t bitcount, mi_commit_mask_t* cm) {
-  mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
-  mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
-  if (bitcount == MI_COMMIT_MASK_BITS) {
-    mi_assert_internal(bitidx==0);
-    mi_commit_mask_create_full(cm);
-  }
-  else if (bitcount == 0) {
-    mi_commit_mask_create_empty(cm);
-  }
-  else {
-    mi_commit_mask_create_empty(cm);
-    size_t i = bitidx / MI_COMMIT_MASK_FIELD_BITS;
-    size_t ofs = bitidx % MI_COMMIT_MASK_FIELD_BITS;
-    while (bitcount > 0) {
-      mi_assert_internal(i < MI_COMMIT_MASK_FIELD_COUNT);
-      size_t avail = MI_COMMIT_MASK_FIELD_BITS - ofs;
-      size_t count = (bitcount > avail ? avail : bitcount);
-      size_t mask = (count >= MI_COMMIT_MASK_FIELD_BITS ? ~((size_t)0) : (((size_t)1 << count) - 1) << ofs);
-      cm->mask[i] = mask;
-      bitcount -= count;
-      ofs = 0;
-      i++;
-    }
-  }
-}
-
-size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total) {
-  mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
-  size_t count = 0;
-  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
-    size_t mask = cm->mask[i];
-    if (~mask == 0) {
-      count += MI_COMMIT_MASK_FIELD_BITS;
-    }
-    else {
-      for (; mask != 0; mask >>= 1) {  // todo: use popcount
-        if ((mask&1)!=0) count++;
-      }
-    }
-  }
-  // we use total since for huge segments each commit bit may represent a larger size
-  return ((total / MI_COMMIT_MASK_BITS) * count);
-}
-
-
-size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) {
-  size_t i = (*idx) / MI_COMMIT_MASK_FIELD_BITS;
-  size_t ofs = (*idx) % MI_COMMIT_MASK_FIELD_BITS;
-  size_t mask = 0;
-  // find first ones
-  while (i < MI_COMMIT_MASK_FIELD_COUNT) {
-    mask = cm->mask[i];
-    mask >>= ofs;
-    if (mask != 0) {
-      while ((mask&1) == 0) {
-        mask >>= 1;
-        ofs++;
-      }
-      break;
-    }
-    i++;
-    ofs = 0;
-  }
-  if (i >= MI_COMMIT_MASK_FIELD_COUNT) {
-    // not found
-    *idx = MI_COMMIT_MASK_BITS;
-    return 0;
-  }
-  else {
-    // found, count ones
-    size_t count = 0;
-    *idx = (i*MI_COMMIT_MASK_FIELD_BITS) + ofs;
-    do {
-      mi_assert_internal(ofs < MI_COMMIT_MASK_FIELD_BITS && (mask&1) == 1);
-      do {
-        count++;
-        mask >>= 1;
-      } while ((mask&1) == 1);
-      if ((((*idx + count) % MI_COMMIT_MASK_FIELD_BITS) == 0)) {
-        i++;
-        if (i >= MI_COMMIT_MASK_FIELD_COUNT) break;
-        mask = cm->mask[i];
-        ofs = 0;
-      }
-    } while ((mask&1) == 1);
-    mi_assert_internal(count > 0);
-    return count;
-  }
-}
+#define MI_PAGE_HUGE_ALIGN   (256*1024)
+
+static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size);

 /* --------------------------------------------------------------------------------
   Segment allocation
-  We allocate pages inside bigger "segments" (32 MiB on 64-bit). This is to avoid
+  We allocate pages inside bigger "segments" (4MiB on 64-bit). This is to avoid
   splitting VMA's on Linux and reduce fragmentation on other OS's.
   Each thread owns its own segments.

   Currently we have:
-  - small pages (64KiB)
-  - medium pages (512KiB)
-  - large pages (4MiB),
+  - small pages (64KiB), 64 in one segment
+  - medium pages (512KiB), 8 in one segment
+  - large pages (4MiB), 1 in one segment
   - huge segments have 1 page in one segment that can be larger than `MI_SEGMENT_SIZE`.
     it is used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or with alignment `> MI_BLOCK_ALIGNMENT_MAX`.
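The page counts in the restored comment follow directly from the sizes; a quick self-check under the stated v1 constants (4 MiB segments), not code from the patch:

#include <assert.h>

#define SEGMENT_SIZE (4u << 20)   // 4 MiB, per the comment above

int main(void) {
  assert(SEGMENT_SIZE / (64u << 10)  == 64);  // 64 small pages per segment
  assert(SEGMENT_SIZE / (512u << 10) == 8);   // 8 medium pages per segment
  assert(SEGMENT_SIZE / (4u << 20)   == 1);   // 1 large page per segment
  return 0;
}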
@@ -171,84 +38,74 @@ size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) {

 /* -----------------------------------------------------------
-  Slices
+  Queue of segments containing free pages
 ----------------------------------------------------------- */

+#if (MI_DEBUG>=3)
+static bool mi_segment_queue_contains(const mi_segment_queue_t* queue, const mi_segment_t* segment) {
+  mi_assert_internal(segment != NULL);
+  mi_segment_t* list = queue->first;
+  while (list != NULL) {
+    if (list == segment) break;
+    mi_assert_internal(list->next==NULL || list->next->prev == list);
+    mi_assert_internal(list->prev==NULL || list->prev->next == list);
+    list = list->next;
+  }
+  return (list == segment);
+}
+#endif

-static const mi_slice_t* mi_segment_slices_end(const mi_segment_t* segment) {
-  return &segment->slices[segment->slice_entries];
+/*
+static bool mi_segment_queue_is_empty(const mi_segment_queue_t* queue) {
+  return (queue->first == NULL);
+}
+*/
+
+static void mi_segment_queue_remove(mi_segment_queue_t* queue, mi_segment_t* segment) {
+  mi_assert_expensive(mi_segment_queue_contains(queue, segment));
+  if (segment->prev != NULL) segment->prev->next = segment->next;
+  if (segment->next != NULL) segment->next->prev = segment->prev;
+  if (segment == queue->first) queue->first = segment->next;
+  if (segment == queue->last)  queue->last = segment->prev;
+  segment->next = NULL;
+  segment->prev = NULL;
 }

-static uint8_t* mi_slice_start(const mi_slice_t* slice) {
-  mi_segment_t* segment = _mi_ptr_segment(slice);
-  mi_assert_internal(slice >= segment->slices && slice < mi_segment_slices_end(segment));
-  return ((uint8_t*)segment + ((slice - segment->slices)*MI_SEGMENT_SLICE_SIZE));
+static void mi_segment_enqueue(mi_segment_queue_t* queue, mi_segment_t* segment) {
+  mi_assert_expensive(!mi_segment_queue_contains(queue, segment));
+  segment->next = NULL;
+  segment->prev = queue->last;
+  if (queue->last != NULL) {
+    mi_assert_internal(queue->last->next == NULL);
+    queue->last->next = segment;
+    queue->last = segment;
+  }
+  else {
+    queue->last = queue->first = segment;
+  }
 }

-
-/* -----------------------------------------------------------
-  Bins
------------------------------------------------------------ */
-// Use bit scan forward to quickly find the first zero bit if it is available
-
-static inline size_t mi_slice_bin8(size_t slice_count) {
-  if (slice_count<=1) return slice_count;
-  mi_assert_internal(slice_count <= MI_SLICES_PER_SEGMENT);
-  slice_count--;
-  size_t s = mi_bsr(slice_count);  // slice_count > 1
-  if (s <= 2) return slice_count + 1;
-  size_t bin = ((s << 2) | ((slice_count >> (s - 2))&0x03)) - 4;
-  return bin;
+static mi_segment_queue_t* mi_segment_free_queue_of_kind(mi_page_kind_t kind, mi_segments_tld_t* tld) {
+  if (kind == MI_PAGE_SMALL) return &tld->small_free;
+  else if (kind == MI_PAGE_MEDIUM) return &tld->medium_free;
+  else return NULL;
 }

-static inline size_t mi_slice_bin(size_t slice_count) {
-  mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_SEGMENT_SIZE);
-  mi_assert_internal(mi_slice_bin8(MI_SLICES_PER_SEGMENT) <= MI_SEGMENT_BIN_MAX);
-  size_t bin = mi_slice_bin8(slice_count);
-  mi_assert_internal(bin <= MI_SEGMENT_BIN_MAX);
-  return bin;
+static mi_segment_queue_t* mi_segment_free_queue(const mi_segment_t* segment, mi_segments_tld_t* tld) {
+  return mi_segment_free_queue_of_kind(segment->page_kind, tld);
 }

-static inline size_t mi_slice_index(const mi_slice_t* slice) {
-  mi_segment_t* segment = _mi_ptr_segment(slice);
-  ptrdiff_t index = slice - segment->slices;
-  mi_assert_internal(index >= 0 && index < (ptrdiff_t)segment->slice_entries);
-  return index;
+// remove from free queue if it is in one
+static void mi_segment_remove_from_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) {
+  mi_segment_queue_t* queue = mi_segment_free_queue(segment, tld); // may be NULL
+  bool in_queue = (queue!=NULL && (segment->next != NULL || segment->prev != NULL || queue->first == segment));
+  if (in_queue) {
+    mi_segment_queue_remove(queue, segment);
+  }
 }

-
-/* -----------------------------------------------------------
-  Slice span queues
------------------------------------------------------------ */
-
-static void mi_span_queue_push(mi_span_queue_t* sq, mi_slice_t* slice) {
-  // todo: or push to the end?
-  mi_assert_internal(slice->prev == NULL && slice->next==NULL);
-  slice->prev = NULL; // paranoia
-  slice->next = sq->first;
-  sq->first = slice;
-  if (slice->next != NULL) slice->next->prev = slice;
-  else sq->last = slice;
-  slice->block_size = 0; // free
-}
-
-static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* tld) {
-  size_t bin = mi_slice_bin(slice_count);
-  mi_span_queue_t* sq = &tld->spans[bin];
-  mi_assert_internal(sq->slice_count >= slice_count);
-  return sq;
-}
-
-static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) {
-  mi_assert_internal(slice->block_size==0 && slice->slice_count>0 && slice->slice_offset==0);
-  // should work too if the queue does not contain slice (which can happen during reclaim)
-  if (slice->prev != NULL) slice->prev->next = slice->next;
-  if (slice == sq->first) sq->first = slice->next;
-  if (slice->next != NULL) slice->next->prev = slice->prev;
-  if (slice == sq->last) sq->last = slice->prev;
-  slice->prev = NULL;
-  slice->next = NULL;
-  slice->block_size = 1; // no more free
+static void mi_segment_insert_in_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) {
+  mi_segment_enqueue(mi_segment_free_queue(segment, tld), segment);
 }

@@ -256,136 +113,366 @@ static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) {
   Invariant checking
 ----------------------------------------------------------- */

-static bool mi_slice_is_used(const mi_slice_t* slice) {
-  return (slice->block_size > 0);
+#if (MI_DEBUG >= 2) || (MI_SECURE >= 2)
+static size_t mi_segment_page_size(const mi_segment_t* segment) {
+  if (segment->capacity > 1) {
+    mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM);
+    return ((size_t)1 << segment->page_shift);
+  }
+  else {
+    mi_assert_internal(segment->page_kind >= MI_PAGE_LARGE);
+    return segment->segment_size;
+  }
 }
+#endif

-
-#if (MI_DEBUG>=3)
-static bool mi_span_queue_contains(mi_span_queue_t* sq, mi_slice_t* slice) {
-  for (mi_slice_t* s = sq->first; s != NULL; s = s->next) {
-    if (s==slice) return true;
+#if (MI_DEBUG>=2)
+static bool mi_pages_purge_contains(const mi_page_t* page, mi_segments_tld_t* tld) {
+  mi_page_t* p = tld->pages_purge.first;
+  while (p != NULL) {
+    if (p == page) return true;
+    p = p->next;
   }
   return false;
 }
+#endif

-static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) {
+#if (MI_DEBUG>=3)
+static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* tld) {
   mi_assert_internal(segment != NULL);
   mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(segment->used <= segment->capacity);
   mi_assert_internal(segment->abandoned <= segment->used);
-  mi_assert_internal(segment->thread_id == 0 || segment->thread_id == _mi_thread_id());
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); // can only decommit committed blocks
-  //mi_assert_internal(segment->segment_info_size % MI_SEGMENT_SLICE_SIZE == 0);
-  mi_slice_t* slice = &segment->slices[0];
-  const mi_slice_t* end = mi_segment_slices_end(segment);
-  size_t used_count = 0;
-  mi_span_queue_t* sq;
-  while(slice < end) {
-    mi_assert_internal(slice->slice_count > 0);
-    mi_assert_internal(slice->slice_offset == 0);
-    size_t index = mi_slice_index(slice);
-    size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? segment->slice_entries : index + slice->slice_count) - 1;
-    if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET_COUNT valid back offsets
-      used_count++;
-      mi_assert_internal(slice->is_huge == (segment->kind == MI_SEGMENT_HUGE));
-      for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET_COUNT && index + i <= maxindex; i++) {
-        mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t));
-        mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0);
-        mi_assert_internal(i==0 || segment->slices[index + i].block_size == 1);
-      }
-      // and the last entry as well (for coalescing)
-      const mi_slice_t* last = slice + slice->slice_count - 1;
-      if (last > slice && last < mi_segment_slices_end(segment)) {
-        mi_assert_internal(last->slice_offset == (slice->slice_count-1)*sizeof(mi_slice_t));
-        mi_assert_internal(last->slice_count == 0);
-        mi_assert_internal(last->block_size == 1);
-      }
+  mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1);  // one large or huge page per segment
+  size_t nfree = 0;
+  for (size_t i = 0; i < segment->capacity; i++) {
+    const mi_page_t* const page = &segment->pages[i];
+    if (!page->segment_in_use) {
+      nfree++;
     }
-    else { // free range of slices; only last slice needs a valid back offset
-      mi_slice_t* last = &segment->slices[maxindex];
-      if (segment->kind != MI_SEGMENT_HUGE || slice->slice_count <= (segment->slice_entries - segment->segment_info_slices)) {
-        mi_assert_internal((uint8_t*)slice == (uint8_t*)last - last->slice_offset);
-      }
-      mi_assert_internal(slice == last || last->slice_count == 0 );
-      mi_assert_internal(last->block_size == 0 || (segment->kind==MI_SEGMENT_HUGE && last->block_size==1));
-      if (segment->kind != MI_SEGMENT_HUGE && segment->thread_id != 0) { // segment is not huge or abandoned
-        sq = mi_span_queue_for(slice->slice_count,tld);
-        mi_assert_internal(mi_span_queue_contains(sq,slice));
-      }
+    if (page->segment_in_use) {
+      mi_assert_expensive(!mi_pages_purge_contains(page, tld));
     }
-    slice = &segment->slices[maxindex+1];
+    mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE));
   }
-  mi_assert_internal(slice == end);
-  mi_assert_internal(used_count == segment->used + 1);
+  mi_assert_internal(nfree + segment->used == segment->capacity);
+  // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0
+  mi_assert_internal(segment->page_kind == MI_PAGE_HUGE ||
+                     (mi_segment_page_size(segment) * segment->capacity == segment->segment_size));
   return true;
 }
 #endif

+static bool mi_page_not_in_queue(const mi_page_t* page, mi_segments_tld_t* tld) {
+  mi_assert_internal(page != NULL);
+  if (page->next != NULL || page->prev != NULL) {
+    mi_assert_internal(mi_pages_purge_contains(page, tld));
+    return false;
+  }
+  else {
+    // both next and prev are NULL, check for singleton list
+    return (tld->pages_purge.first != page && tld->pages_purge.last != page);
+  }
+}


/* -----------------------------------------------------------
+ Guard pages
+----------------------------------------------------------- */
+
+static void mi_segment_protect_range(void* p, size_t size, bool protect) {
+  if (protect) {
+    _mi_os_protect(p, size);
+  }
+  else {
+    _mi_os_unprotect(p, size);
+  }
+}
+
+static void mi_segment_protect(mi_segment_t* segment, bool protect) {
+  // add/remove guard pages
+  if (MI_SECURE != 0) {
+    // in secure mode, we set up a protected page in between the segment info and the page data
+    const size_t os_psize = _mi_os_page_size();
+    mi_assert_internal((segment->segment_info_size - os_psize) >= (sizeof(mi_segment_t) + ((segment->capacity - 1) * sizeof(mi_page_t))));
+    mi_assert_internal(((uintptr_t)segment + segment->segment_info_size) % os_psize == 0);
+    mi_segment_protect_range((uint8_t*)segment + segment->segment_info_size - os_psize, os_psize, protect);
+    #if (MI_SECURE >= 2)
+    if (segment->capacity == 1)
+    #endif
+    {
+      // and protect the last (or only) page too
+      mi_assert_internal(MI_SECURE <= 1 || segment->page_kind >= MI_PAGE_LARGE);
+      uint8_t* start = (uint8_t*)segment + segment->segment_size - os_psize;
+      if (protect && !segment->memid.initially_committed) {
+        if (protect) {
+          // ensure secure page is committed
+          if (_mi_os_commit(start, os_psize, NULL)) {  // if this fails that is ok (as it is an unaccessible page)
+            mi_segment_protect_range(start, os_psize, protect);
+          }
+        }
+      }
+      else {
+        mi_segment_protect_range(start, os_psize, protect);
+      }
+    }
+    #if (MI_SECURE >= 2)
+    else {
+      // or protect every page
+      const size_t page_size = mi_segment_page_size(segment);
+      for (size_t i = 0; i < segment->capacity; i++) {
+        if (segment->pages[i].is_committed) {
+          mi_segment_protect_range((uint8_t*)segment + (i+1)*page_size - os_psize, os_psize, protect);
+        }
+      }
+    }
+    #endif
+  }
+}
+
+/* -----------------------------------------------------------
+ Page reset
+----------------------------------------------------------- */
+
+static void mi_page_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) {
+  // todo: should we purge the guard page as well when MI_SECURE>=2 ?
+  mi_assert_internal(page->is_committed);
+  mi_assert_internal(!page->segment_in_use);
+  if (!segment->allow_purge) return;
+  mi_assert_internal(page->used == 0);
+  mi_assert_internal(page->free == NULL);
+  mi_assert_expensive(!mi_pages_purge_contains(page, tld)); MI_UNUSED(tld);
+  size_t psize;
+  void* start = mi_segment_raw_page_start(segment, page, &psize);
+  const bool needs_recommit = _mi_os_purge(start, psize);
+  if (needs_recommit) { page->is_committed = false; }
+}
+
+static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) {
+  if (page->is_committed) return true;
+  mi_assert_internal(segment->allow_decommit);
+  mi_assert_expensive(!mi_pages_purge_contains(page, tld)); MI_UNUSED(tld);
+
+  size_t psize;
+  uint8_t* start = mi_segment_raw_page_start(segment, page, &psize);
+  bool is_zero = false;
+  const size_t gsize = (MI_SECURE >= 2 ? _mi_os_page_size() : 0);
+  bool ok = _mi_os_commit(start, psize + gsize, &is_zero);
+  if (!ok) return false; // failed to commit!
+  page->is_committed = true;
+  page->used = 0;
+  page->free = NULL;
+  page->is_zero_init = is_zero;
+  if (gsize > 0) {
+    mi_segment_protect_range(start + psize, gsize, true);
+  }
+  return true;
+}
+
+
+/* -----------------------------------------------------------
+  The free page queue
+----------------------------------------------------------- */
+
+// we re-use the `free` field for the expiration counter. Since this is a
+// pointer size field while the clock is always 64-bit we need to guard
+// against overflow; we use subtraction to check for expiry which works
+// as long as the reset delay is under (2^30 - 1) milliseconds (~12 days)
+static uint32_t mi_page_get_expire( mi_page_t* page ) {
+  return (uint32_t)((uintptr_t)page->free);
+}
+
+static void mi_page_set_expire( mi_page_t* page, uint32_t expire ) {
+  page->free = (mi_block_t*)((uintptr_t)expire);
+}
+
+static void mi_page_purge_set_expire(mi_page_t* page) {
+  mi_assert_internal(mi_page_get_expire(page)==0);
+  uint32_t expire = (uint32_t)_mi_clock_now() + mi_option_get(mi_option_purge_delay);
+  mi_page_set_expire(page, expire);
+}
+
+// we re-use the `free` field for the expiration counter. Since this is a
+// pointer size field while the clock is always 64-bit we need to guard
+// against overflow; we use subtraction to check for expiry which works
+// as long as the reset delay is under (2^30 - 1) milliseconds (~12 days)
+static bool mi_page_purge_is_expired(mi_page_t* page, mi_msecs_t now) {
+  int32_t expire = (int32_t)mi_page_get_expire(page);
+  return (((int32_t)now - expire) >= 0);
+}
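The expiry comment above is worth unpacking: the 64-bit clock is truncated to 32 bits and compared by signed subtraction, which stays correct across unsigned wraparound for delays below 2^31 ms. A small standalone demo with illustrative values (not code from the patch):

#include <stdint.h>
#include <stdio.h>

// same test as mi_page_purge_is_expired: signed difference of truncated clocks
static int is_expired(uint32_t expire, uint64_t now64) {
  return ((int32_t)((uint32_t)now64 - expire) >= 0);
}

int main(void) {
  uint64_t now = 0xFFFFFFF0ull;             // clock close to the 32-bit wrap
  uint32_t deadline = (uint32_t)now + 100;  // deadline wraps past zero
  printf("%d\n", is_expired(deadline, now));        // 0: not yet expired
  printf("%d\n", is_expired(deadline, now + 200));  // 1: expired, despite wrap
  return 0;
}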
+static void mi_segment_schedule_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) {
+  mi_assert_internal(!page->segment_in_use);
+  mi_assert_internal(mi_page_not_in_queue(page,tld));
+  mi_assert_expensive(!mi_pages_purge_contains(page, tld));
+  mi_assert_internal(_mi_page_segment(page)==segment);
+  if (!segment->allow_purge) return;
+
+  if (mi_option_get(mi_option_purge_delay) == 0) {
+    // purge immediately?
+    mi_page_purge(segment, page, tld);
+  }
+  else if (mi_option_get(mi_option_purge_delay) > 0) {  // no purging if the delay is negative
+    // otherwise push on the delayed page reset queue
+    mi_page_queue_t* pq = &tld->pages_purge;
+    // push on top
+    mi_page_purge_set_expire(page);
+    page->next = pq->first;
+    page->prev = NULL;
+    if (pq->first == NULL) {
+      mi_assert_internal(pq->last == NULL);
+      pq->first = pq->last = page;
+    }
+    else {
+      pq->first->prev = page;
+      pq->first = page;
+    }
+  }
+}
+
+static void mi_page_purge_remove(mi_page_t* page, mi_segments_tld_t* tld) {
+  if (mi_page_not_in_queue(page,tld)) return;
+
+  mi_page_queue_t* pq = &tld->pages_purge;
+  mi_assert_internal(pq!=NULL);
+  mi_assert_internal(!page->segment_in_use);
+  mi_assert_internal(mi_page_get_expire(page) != 0);
+  mi_assert_internal(mi_pages_purge_contains(page, tld));
+  if (page->prev != NULL) page->prev->next = page->next;
+  if (page->next != NULL) page->next->prev = page->prev;
+  if (page == pq->last)  pq->last = page->prev;
+  if (page == pq->first) pq->first = page->next;
+  page->next = page->prev = NULL;
+  mi_page_set_expire(page,0);
+}
+
+static void mi_segment_remove_all_purges(mi_segment_t* segment, bool force_purge, mi_segments_tld_t* tld) {
+  if (segment->memid.is_pinned) return; // never reset in huge OS pages
+  for (size_t i = 0; i < segment->capacity; i++) {
+    mi_page_t* page = &segment->pages[i];
+    if (!page->segment_in_use) {
+      mi_page_purge_remove(page, tld);
+      if (force_purge && page->is_committed) {
+        mi_page_purge(segment, page, tld);
+      }
+    }
+    else {
+      mi_assert_internal(mi_page_not_in_queue(page,tld));
+    }
+  }
+}
+
+static void mi_pages_try_purge(bool force, mi_segments_tld_t* tld) {
+  if (mi_option_get(mi_option_purge_delay) < 0) return;  // purging is not allowed
+
+  mi_msecs_t now = _mi_clock_now();
+  mi_page_queue_t* pq = &tld->pages_purge;
+  // from oldest up to the first that has not expired yet
+  mi_page_t* page = pq->last;
+  while (page != NULL && (force || mi_page_purge_is_expired(page,now))) {
+    mi_page_t* const prev = page->prev; // save previous field
+    mi_page_purge_remove(page, tld);    // remove from the list to maintain invariant for mi_page_purge
+    mi_page_purge(_mi_page_segment(page), page, tld);
+    page = prev;
+  }
+  // discard the reset pages from the queue
+  pq->last = page;
+  if (page != NULL){
+    page->next = NULL;
+  }
+  else {
+    pq->first = NULL;
+  }
+}
+

 /* -----------------------------------------------------------
  Segment size calculations
 ----------------------------------------------------------- */

-static size_t mi_segment_info_size(mi_segment_t* segment) {
-  return segment->segment_info_slices * MI_SEGMENT_SLICE_SIZE;
+static size_t mi_segment_raw_page_size(const mi_segment_t* segment) {
+  return (segment->page_kind == MI_PAGE_HUGE ? segment->segment_size : (size_t)1 << segment->page_shift);
 }

-static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, const mi_slice_t* slice, size_t block_size, size_t* page_size)
-{
-  const ptrdiff_t idx = slice - segment->slices;
-  const size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE;
-  uint8_t* const pstart = (uint8_t*)segment + (idx*MI_SEGMENT_SLICE_SIZE);
-  // make the start not OS page aligned for smaller blocks to avoid page/cache effects
-  // note: the offset must always be a block_size multiple since we assume small allocations
-  // are aligned (see `mi_heap_malloc_aligned`).
-  size_t start_offset = 0;
-  if (block_size > 0 && block_size <= MI_MAX_ALIGN_GUARANTEE) {
-    // for small objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore)
-    const size_t adjust = block_size - ((uintptr_t)pstart % block_size);
-    if (adjust < block_size && psize >= block_size + adjust) {
-      start_offset += adjust;
-    }
+// Raw start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set)
+// The raw start is not taking aligned block allocation into consideration.
+static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
+  size_t psize = mi_segment_raw_page_size(segment);
+  uint8_t* p = (uint8_t*)segment + page->segment_idx * psize;
+
+  if (page->segment_idx == 0) {
+    // the first page starts after the segment info (and possible guard page)
+    p += segment->segment_info_size;
+    psize -= segment->segment_info_size;
   }
-  if (block_size >= MI_INTPTR_SIZE) {
-    if (block_size <= 64) { start_offset += 3*block_size; }
-    else if (block_size <= 512) { start_offset += block_size; }
+
+#if (MI_SECURE > 1)  // every page has an os guard page
+  psize -= _mi_os_page_size();
+#elif (MI_SECURE==1) // the last page has an os guard page at the end
+  if (page->segment_idx == segment->capacity - 1) {
+    psize -= _mi_os_page_size();
   }
-  start_offset = _mi_align_up(start_offset, MI_MAX_ALIGN_SIZE);
-  mi_assert_internal(_mi_is_aligned(pstart + start_offset, MI_MAX_ALIGN_SIZE));
-  mi_assert_internal(block_size == 0 || block_size > MI_MAX_ALIGN_GUARANTEE || _mi_is_aligned(pstart + start_offset,block_size));
-  if (page_size != NULL) { *page_size = psize - start_offset; }
-  return (pstart + start_offset);
+#endif
+
+  if (page_size != NULL) *page_size = psize;
+  mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page);
+  mi_assert_internal(_mi_ptr_segment(p) == segment);
+  return p;
 }

-// Start of the page available memory; can be used on uninitialized pages
+// Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set)
 uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size)
 {
-  const mi_slice_t* slice = mi_page_to_slice((mi_page_t*)page);
-  uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, mi_page_block_size(page), page_size);
-  mi_assert_internal(mi_page_block_size(page) > 0 || _mi_ptr_page(p) == page);
+  size_t   psize;
+  uint8_t* p = mi_segment_raw_page_start(segment, page, &psize);
+  const size_t block_size = mi_page_block_size(page);
+  if (/*page->segment_idx == 0 &&*/ block_size > 0 && block_size <= MI_MAX_ALIGN_GUARANTEE) {
+    // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore)
+    mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM);
+    size_t adjust = block_size - ((uintptr_t)p % block_size);
+    if (adjust < block_size && psize >= block_size + adjust) {
+      p += adjust;
+      psize -= adjust;
+      mi_assert_internal((uintptr_t)p % block_size == 0);
+    }
+  }
+  mi_assert_internal(_mi_is_aligned(p, MI_MAX_ALIGN_SIZE));
+  mi_assert_internal(block_size == 0 || block_size > MI_MAX_ALIGN_GUARANTEE || _mi_is_aligned(p,block_size));
+
+  if (page_size != NULL) *page_size = psize;
+  mi_assert_internal(_mi_ptr_page(p) == page);
   mi_assert_internal(_mi_ptr_segment(p) == segment);
   return p;
 }
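A worked example of the block-size adjustment in _mi_segment_page_start (all values made up): the start pointer is bumped to the next block_size multiple and the usable size shrinks by the same amount.

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uintptr_t p = 0x10010;     // raw page start (after the segment info)
  size_t psize = 0x10000;    // raw page size
  size_t block_size = 0x60;  // 96-byte blocks
  size_t adjust = block_size - (p % block_size);
  if (adjust < block_size && psize >= block_size + adjust) {
    p += adjust;             // now p % block_size == 0
    psize -= adjust;
  }
  printf("start=%#lx size=%#zx rem=%zu\n",
         (unsigned long)p, psize, (size_t)(p % block_size));
  return 0;
}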
-static size_t mi_segment_calculate_slices(size_t required, size_t* info_slices) {
-  size_t page_size = _mi_os_page_size();
-  size_t isize = _mi_align_up(sizeof(mi_segment_t), page_size);
+static size_t mi_segment_calculate_sizes(size_t capacity, size_t required, size_t* pre_size, size_t* info_size)
+{
+  const size_t minsize = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */;
   size_t guardsize = 0;
+  size_t isize     = 0;
-
-  if (MI_SECURE>0) {
+
+  if (MI_SECURE == 0) {
+    // normally no guard pages
+    #if MI_GUARDED
+    isize = _mi_align_up(minsize, _mi_os_page_size());
+    #else
+    isize = _mi_align_up(minsize, 16 * MI_MAX_ALIGN_SIZE);
+    #endif
+  }
+  else {
     // in secure mode, we set up a protected page in between the segment info
     // and the page data (and one at the end of the segment)
+    const size_t page_size = _mi_os_page_size();
+    isize = _mi_align_up(minsize, page_size);
     guardsize = page_size;
-    if (required > 0) {
-      required = _mi_align_up(required, MI_SEGMENT_SLICE_SIZE) + page_size;
-    }
+    //required = _mi_align_up(required, isize + guardsize);
   }
-  isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE);
-  if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE;
-  size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) );
-  mi_assert_internal(segment_size % MI_SEGMENT_SLICE_SIZE == 0);
-  return (segment_size / MI_SEGMENT_SLICE_SIZE);
+
+  if (info_size != NULL) *info_size = isize;
+  if (pre_size != NULL)  *pre_size  = isize + guardsize;
+  return (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + 2*guardsize, MI_PAGE_HUGE_ALIGN) );
 }

@@ -404,424 +491,43 @@ static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) {
   if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size;
 }

-static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
+static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) {
   segment->thread_id = 0;
   _mi_segment_map_freed_at(segment);
-  mi_segments_track_size(-((long)mi_segment_size(segment)),tld);
+  mi_segments_track_size(-((long)segment_size),tld);
   if (segment->was_reclaimed) {
     tld->reclaim_count--;
     segment->was_reclaimed = false;
   }
-  if (MI_SECURE>0) {
-    // _mi_os_unprotect(segment, mi_segment_size(segment)); // ensure no more guard pages are set
-    // unprotect the guard pages; we cannot just unprotect the whole segment size as part may be decommitted
-    size_t os_pagesize = _mi_os_page_size();
-    _mi_os_unprotect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
-    uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
-    _mi_os_unprotect(end, os_pagesize);
+
+  if (MI_SECURE != 0) {
+    mi_assert_internal(!segment->memid.is_pinned);
+    mi_segment_protect(segment, false); // ensure no more guard pages are set
   }
-  // purge delayed decommits now? (no, leave it to the arena)
-  // mi_segment_try_purge(segment,true,tld->stats);
+  bool fully_committed = true;
+  size_t committed_size = 0;
+  const size_t page_size = mi_segment_raw_page_size(segment);
+  for (size_t i = 0; i < segment->capacity; i++) {
+    mi_page_t* page = &segment->pages[i];
+    if (page->is_committed)  { committed_size += page_size; }
+    if (!page->is_committed) { fully_committed = false; }
+  }
+  MI_UNUSED(fully_committed);
+  mi_assert_internal((fully_committed && committed_size == segment_size) || (!fully_committed && committed_size < segment_size));

-  const size_t size = mi_segment_size(segment);
-  const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size);
-
-  _mi_arena_free(segment, mi_segment_size(segment), csize, segment->memid);
+  _mi_arena_free(segment, segment_size, committed_size, segment->memid);
 }
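For reference while reading the removed code below: mi_segment_commit_mask rounded ranges conservatively (inward) for purging and liberally (outward) for committing. A sketch of the two roundings with an assumed COMMIT_SIZE, not the patch's code:

#include <stddef.h>
#include <stdio.h>

#define COMMIT_SIZE 0x10000u

static size_t align_down(size_t x, size_t a) { return x - (x % a); }
static size_t align_up(size_t x, size_t a)   { return align_down(x + a - 1, a); }

int main(void) {
  size_t lo = 0x12345, hi = 0x45678;
  // conservative (purge): shrink to the blocks fully inside [lo, hi)
  printf("purge:  [%#zx, %#zx)\n", align_up(lo, COMMIT_SIZE), align_down(hi, COMMIT_SIZE));
  // liberal (commit): grow to cover [lo, hi) completely
  printf("commit: [%#zx, %#zx)\n", align_down(lo, COMMIT_SIZE), align_up(hi, COMMIT_SIZE));
  return 0;
}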
-/* -----------------------------------------------------------
-  Commit/Decommit ranges
------------------------------------------------------------ */
-
-static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uint8_t* p, size_t size, uint8_t** start_p, size_t* full_size, mi_commit_mask_t* cm) {
-  mi_assert_internal(_mi_ptr_segment(p + 1) == segment);
-  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
-  mi_commit_mask_create_empty(cm);
-  if (size == 0 || size > MI_SEGMENT_SIZE || segment->kind == MI_SEGMENT_HUGE) return;
-  const size_t segstart = mi_segment_info_size(segment);
-  const size_t segsize = mi_segment_size(segment);
-  if (p >= (uint8_t*)segment + segsize) return;
-
-  size_t pstart = (p - (uint8_t*)segment);
-  mi_assert_internal(pstart + size <= segsize);
-
-  size_t start;
-  size_t end;
-  if (conservative) {
-    // decommit conservative
-    start = _mi_align_up(pstart, MI_COMMIT_SIZE);
-    end   = _mi_align_down(pstart + size, MI_COMMIT_SIZE);
-    mi_assert_internal(start >= segstart);
-    mi_assert_internal(end <= segsize);
+// called from `heap_collect`.
+void _mi_segments_collect(bool force, mi_segments_tld_t* tld) {
+  mi_pages_try_purge(force,tld);
+  #if MI_DEBUG>=2
+  if (!_mi_is_main_thread()) {
+    mi_assert_internal(tld->pages_purge.first == NULL);
+    mi_assert_internal(tld->pages_purge.last == NULL);
   }
-  else {
-    // commit liberal
-    start = _mi_align_down(pstart, MI_MINIMAL_COMMIT_SIZE);
-    end   = _mi_align_up(pstart + size, MI_MINIMAL_COMMIT_SIZE);
-  }
-  if (pstart >= segstart && start < segstart) {  // note: the mask is also calculated for an initial commit of the info area
-    start = segstart;
-  }
-  if (end > segsize) {
-    end = segsize;
-  }
-
-  mi_assert_internal(start <= pstart && (pstart + size) <= end);
-  mi_assert_internal(start % MI_COMMIT_SIZE==0 && end % MI_COMMIT_SIZE == 0);
-  *start_p   = (uint8_t*)segment + start;
-  *full_size = (end > start ? end - start : 0);
-  if (*full_size == 0) return;
-
-  size_t bitidx = start / MI_COMMIT_SIZE;
-  mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
-
-  size_t bitcount = *full_size / MI_COMMIT_SIZE; // can be 0
-  if (bitidx + bitcount > MI_COMMIT_MASK_BITS) {
-    _mi_warning_message("commit mask overflow: idx=%zu count=%zu start=%zx end=%zx p=0x%p size=%zu fullsize=%zu\n", bitidx, bitcount, start, end, p, size, *full_size);
-  }
-  mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
-  mi_commit_mask_create(bitidx, bitcount, cm);
-}
-
-static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size) {
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask));
-
-  // commit liberal
-  uint8_t* start = NULL;
-  size_t   full_size = 0;
-  mi_commit_mask_t mask;
-  mi_segment_commit_mask(segment, false /* conservative? */, p, size, &start, &full_size, &mask);
-  if (mi_commit_mask_is_empty(&mask) || full_size == 0) return true;
-
-  if (!mi_commit_mask_all_set(&segment->commit_mask, &mask)) {
-    // committing
-    bool is_zero = false;
-    mi_commit_mask_t cmask;
-    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
-    _mi_stat_decrease(&_mi_stats_main.committed, _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
-    if (!_mi_os_commit(start, full_size, &is_zero)) return false;
-    mi_commit_mask_set(&segment->commit_mask, &mask);
-  }
-
-  // increase purge expiration when using part of delayed purges -- we assume more allocations are coming soon.
-  if (mi_commit_mask_any_set(&segment->purge_mask, &mask)) {
-    segment->purge_expire = _mi_clock_now() + mi_option_get(mi_option_purge_delay);
-  }
-
-  // always clear any delayed purges in our range (as they are either committed now)
-  mi_commit_mask_clear(&segment->purge_mask, &mask);
-  return true;
-}
-
-static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size) {
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask));
-  // note: assumes commit_mask is always full for huge segments as otherwise the commit mask bits can overflow
-  if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->purge_mask)) return true; // fully committed
-  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
-  return mi_segment_commit(segment, p, size);
-}
-
-static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size) {
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask));
-  if (!segment->allow_purge) return true;
-
-  // purge conservative
-  uint8_t* start = NULL;
-  size_t   full_size = 0;
-  mi_commit_mask_t mask;
-  mi_segment_commit_mask(segment, true /* conservative? */, p, size, &start, &full_size, &mask);
-  if (mi_commit_mask_is_empty(&mask) || full_size==0) return true;
-
-  if (mi_commit_mask_any_set(&segment->commit_mask, &mask)) {
-    // purging
-    mi_assert_internal((void*)start != (void*)segment);
-    mi_assert_internal(segment->allow_decommit);
-    const bool decommitted = _mi_os_purge(start, full_size); // reset or decommit
-    if (decommitted) {
-      mi_commit_mask_t cmask;
-      mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
-      _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for double counting
-      mi_commit_mask_clear(&segment->commit_mask, &mask);
-    }
-  }
-
-  // always clear any scheduled purges in our range
-  mi_commit_mask_clear(&segment->purge_mask, &mask);
-  return true;
-}
-
-static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t size) {
-  if (!segment->allow_purge) return;
-
-  if (mi_option_get(mi_option_purge_delay) == 0) {
-    mi_segment_purge(segment, p, size);
-  }
-  else {
-    // register for future purge in the purge mask
-    uint8_t* start = NULL;
-    size_t   full_size = 0;
-    mi_commit_mask_t mask;
-    mi_segment_commit_mask(segment, true /*conservative*/, p, size, &start, &full_size, &mask);
-    if (mi_commit_mask_is_empty(&mask) || full_size==0) return;
-
-    // update delayed commit
-    mi_assert_internal(segment->purge_expire > 0 || mi_commit_mask_is_empty(&segment->purge_mask));
-    mi_commit_mask_t cmask;
-    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);  // only purge what is committed; span_free may try to decommit more
-    mi_commit_mask_set(&segment->purge_mask, &cmask);
-    mi_msecs_t now = _mi_clock_now();
-    if (segment->purge_expire == 0) {
-      // no previous purgess, initialize now
-      segment->purge_expire = now + mi_option_get(mi_option_purge_delay);
-    }
-    else if (segment->purge_expire <= now) {
-      // previous purge mask already expired
-      if (segment->purge_expire + mi_option_get(mi_option_purge_extend_delay) <= now) {
-        mi_segment_try_purge(segment, true);
-      }
-      else {
-        segment->purge_expire = now + mi_option_get(mi_option_purge_extend_delay); // (mi_option_get(mi_option_purge_delay) / 8); // wait a tiny bit longer in case there is a series of free's
-      }
-    }
-    else {
-      // previous purge mask is not yet expired, increase the expiration by a bit.
-      segment->purge_expire += mi_option_get(mi_option_purge_extend_delay);
-    }
-  }
-}
-
-static void mi_segment_try_purge(mi_segment_t* segment, bool force) {
-  if (!segment->allow_purge || segment->purge_expire == 0 || mi_commit_mask_is_empty(&segment->purge_mask)) return;
-  mi_msecs_t now = _mi_clock_now();
-  if (!force && now < segment->purge_expire) return;
-
-  mi_commit_mask_t mask = segment->purge_mask;
-  segment->purge_expire = 0;
-  mi_commit_mask_create_empty(&segment->purge_mask);
-
-  size_t idx;
-  size_t count;
-  mi_commit_mask_foreach(&mask, idx, count) {
-    // if found, decommit that sequence
-    if (count > 0) {
-      uint8_t* p = (uint8_t*)segment + (idx*MI_COMMIT_SIZE);
-      size_t size = count * MI_COMMIT_SIZE;
-      mi_segment_purge(segment, p, size);
-    }
-  }
-  mi_commit_mask_foreach_end()
-  mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask));
-}
-
-// called from `mi_heap_collect_ex`
-// this can be called per-page so it is important that try_purge has fast exit path
-void _mi_segment_collect(mi_segment_t* segment, bool force) {
-  mi_segment_try_purge(segment, force);
-}
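The removed mi_segment_try_purge walked the purge mask run by run via _mi_commit_mask_next_run. The single-word sketch below shows the run-iteration idea in isolation (the patch's version handles multiple mask fields; this is not that code):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t mask = 0x0F00FF03ull;  // example: a few committed/purgeable ranges
  for (int i = 0; i < 64; ) {
    if ((mask >> i) & 1) {
      int start = i;
      while (i < 64 && ((mask >> i) & 1)) i++;  // extend the run of set bits
      printf("run: bits [%d, %d)\n", start, i); // one contiguous range
    }
    else i++;
  }
  return 0;
}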
-
-/* -----------------------------------------------------------
-   Span free
------------------------------------------------------------ */
-
-static bool mi_segment_is_abandoned(mi_segment_t* segment) {
-  return (mi_atomic_load_relaxed(&segment->thread_id) == 0);
-}
-
-// note: can be called on abandoned segments
-static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, bool allow_purge, mi_segments_tld_t* tld) {
-  mi_assert_internal(slice_index < segment->slice_entries);
-  mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment)
-                          ? NULL : mi_span_queue_for(slice_count,tld));
-  if (slice_count==0) slice_count = 1;
-  mi_assert_internal(slice_index + slice_count - 1 < segment->slice_entries);
-
-  // set first and last slice (the intermediates can be undetermined)
-  mi_slice_t* slice = &segment->slices[slice_index];
-  slice->slice_count = (uint32_t)slice_count;
-  mi_assert_internal(slice->slice_count == slice_count); // no overflow?
-  slice->slice_offset = 0;
-  if (slice_count > 1) {
-    mi_slice_t* last = slice + slice_count - 1;
-    mi_slice_t* end  = (mi_slice_t*)mi_segment_slices_end(segment);
-    if (last > end) { last = end; }
-    last->slice_count = 0;
-    last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1));
-    last->block_size = 0;
-  }
-
-  // perhaps decommit
-  if (allow_purge) {
-    mi_segment_schedule_purge(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE);
-  }
-
-  // and push it on the free page queue (if it was not a huge page)
-  if (sq != NULL) mi_span_queue_push( sq, slice );
-             else slice->block_size = 0; // mark huge page as free anyways
-}
-
-/*
-// called from reclaim to add existing free spans
-static void mi_segment_span_add_free(mi_slice_t* slice, mi_segments_tld_t* tld) {
-  mi_segment_t* segment = _mi_ptr_segment(slice);
-  mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
-  size_t slice_index = mi_slice_index(slice);
-  mi_segment_span_free(segment,slice_index,slice->slice_count,tld);
-}
-*/
-
-static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld_t* tld) {
-  mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->block_size==0);
-  mi_assert_internal(_mi_ptr_segment(slice)->kind != MI_SEGMENT_HUGE);
-  mi_span_queue_t* sq = mi_span_queue_for(slice->slice_count, tld);
-  mi_span_queue_delete(sq, slice);
-}
-
-// note: can be called on abandoned segments
-static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) {
-  mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0);
-  mi_segment_t* const segment = _mi_ptr_segment(slice);
-
-  // for huge pages, just mark as free but don't add to the queues
-  if (segment->kind == MI_SEGMENT_HUGE) {
-    // issue #691: segment->used can be 0 if the huge page block was freed while abandoned (reclaim will get here in that case)
-    mi_assert_internal((segment->used==0 && slice->block_size==0) || segment->used == 1); // decreased right after this call in `mi_segment_page_clear`
-    slice->block_size = 0;  // mark as free anyways
-    // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to
-    // avoid a possible cache miss (and the segment is about to be freed)
-    return slice;
-  }
-
-  // otherwise coalesce the span and add to the free span queues
-  const bool is_abandoned = (segment->thread_id == 0); // mi_segment_is_abandoned(segment);
-  size_t slice_count = slice->slice_count;
-  mi_slice_t* next = slice + slice->slice_count;
-  mi_assert_internal(next <= mi_segment_slices_end(segment));
-  if (next < mi_segment_slices_end(segment) && next->block_size==0) {
-    // free next block -- remove it from free and merge
-    mi_assert_internal(next->slice_count > 0 && next->slice_offset==0);
-    slice_count += next->slice_count; // extend
-    if (!is_abandoned) { mi_segment_span_remove_from_queue(next, tld); }
-  }
-  if (slice > segment->slices) {
-    mi_slice_t* prev = mi_slice_first(slice - 1);
-    mi_assert_internal(prev >= segment->slices);
-    if (prev->block_size==0) {
-      // free previous slice -- remove it from free and merge
-      mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0);
-      slice_count += prev->slice_count;
-      slice->slice_count = 0;
-      slice->slice_offset = (uint32_t)((uint8_t*)slice - (uint8_t*)prev); // set the slice offset for `segment_force_abandon` (in case the previous free block is very large).
-      if (!is_abandoned) { mi_segment_span_remove_from_queue(prev, tld); }
-      slice = prev;
-    }
-  }
-
-  // and add the new free page
-  mi_segment_span_free(segment, mi_slice_index(slice), slice_count, true, tld);
-  return slice;
-}
-
-
-
-/* -----------------------------------------------------------
-   Page allocation
------------------------------------------------------------ */
-
-// Note: may still return NULL if committing the memory failed
-static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count) {
-  mi_assert_internal(slice_index < segment->slice_entries);
-  mi_slice_t* const slice = &segment->slices[slice_index];
-  mi_assert_internal(slice->block_size==0 || slice->block_size==1);
-
-  // commit before changing the slice data
-  if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE)) {
-    return NULL;  // commit failed!
-  }
-
-  // convert the slices to a page
-  slice->slice_offset = 0;
-  slice->slice_count = (uint32_t)slice_count;
-  mi_assert_internal(slice->slice_count == slice_count);
-  const size_t bsize = slice_count * MI_SEGMENT_SLICE_SIZE;
-  slice->block_size = bsize;
-  mi_page_t* page = mi_slice_to_page(slice);
-  mi_assert_internal(mi_page_block_size(page) == bsize);
-
-  // set slice back pointers for the first MI_MAX_SLICE_OFFSET_COUNT entries
-  size_t extra = slice_count-1;
-  if (extra > MI_MAX_SLICE_OFFSET_COUNT) extra = MI_MAX_SLICE_OFFSET_COUNT;
-  if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1;  // huge objects may have more slices than avaiable entries in the segment->slices
-
-  mi_slice_t* slice_next = slice + 1;
-  for (size_t i = 1; i <= extra; i++, slice_next++) {
-    slice_next->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i);
-    slice_next->slice_count = 0;
-    slice_next->block_size = 1;
-  }
-
-  // and also for the last one (if not set already) (the last one is needed for coalescing and for large alignments)
-  // note: the cast is needed for ubsan since the index can be larger than MI_SLICES_PER_SEGMENT for huge allocations (see #543)
-  mi_slice_t* last = slice + slice_count - 1;
-  mi_slice_t* end  = (mi_slice_t*)mi_segment_slices_end(segment);
-  if (last > end) last = end;
-  if (last > slice) {
-    last->slice_offset = (uint32_t)(sizeof(mi_slice_t) * (last - slice));
-    last->slice_count = 0;
-    last->block_size = 1;
-  }
-
-  // and initialize the page
-  page->is_committed = true;
-  page->is_huge = (segment->kind == MI_SEGMENT_HUGE);
-  segment->used++;
-  return page;
-}
-
-static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) {
-  mi_assert_internal(_mi_ptr_segment(slice) == segment);
-  mi_assert_internal(slice->slice_count >= slice_count);
-  mi_assert_internal(slice->block_size > 0); // no more in free queue
-  if (slice->slice_count <= slice_count) return;
-  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
-  size_t next_index = mi_slice_index(slice) + slice_count;
-  size_t next_count = slice->slice_count - slice_count;
-  mi_segment_span_free(segment, next_index, next_count, false /* don't purge left-over part */, tld);
-  slice->slice_count = (uint32_t)slice_count;
-}
-
-static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld) {
-  mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_LARGE_OBJ_SIZE_MAX);
-  // search from best fit up
-  mi_span_queue_t* sq = mi_span_queue_for(slice_count, tld);
-  if (slice_count == 0) slice_count = 1;
-  while (sq <= &tld->spans[MI_SEGMENT_BIN_MAX]) {
-    for (mi_slice_t* slice = sq->first; slice != NULL; slice = slice->next) {
-      if (slice->slice_count >= slice_count) {
-        // found one
-        mi_segment_t* segment = _mi_ptr_segment(slice);
-        if (_mi_arena_memid_is_suitable(segment->memid, req_arena_id)) {
-          // found a suitable page span
-          mi_span_queue_delete(sq, slice);
-
-          if (slice->slice_count > slice_count) {
-            mi_segment_slice_split(segment, slice, slice_count, tld);
-          }
-          mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->block_size > 0);
-          mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count);
-          if (page == NULL) {
-            // commit failed; return NULL but first restore the slice
-            mi_segment_span_free_coalesce(slice, tld);
-            return NULL;
-          }
-          return page;
-        }
-      }
-    }
-    sq++;
-  }
-  // could not find a page..
-  return NULL;
+  #endif
 }


@@ -829,223 +535,215 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld) {
   Segment allocation
 ----------------------------------------------------------- */

-static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delayed, mi_arena_id_t req_arena_id,
-                                          size_t* psegment_slices, size_t* pinfo_slices,
-                                          bool commit, mi_segments_tld_t* tld)
-
+static mi_segment_t* mi_segment_os_alloc(bool eager_delayed, size_t page_alignment, mi_arena_id_t req_arena_id,
+                                         size_t pre_size, size_t info_size, bool commit, size_t segment_size,
+                                         mi_segments_tld_t* tld)
 {
   mi_memid_t memid;
   bool allow_large = (!eager_delayed && (MI_SECURE == 0)); // only allow large OS pages once we are no longer lazy
   size_t align_offset = 0;
-  size_t alignment = MI_SEGMENT_ALIGN;
-
+  size_t alignment = MI_SEGMENT_SIZE;
   if (page_alignment > 0) {
-    // mi_assert_internal(huge_page != NULL);
-    mi_assert_internal(page_alignment >= MI_SEGMENT_ALIGN);
     alignment = page_alignment;
-    const size_t info_size = (*pinfo_slices) * MI_SEGMENT_SLICE_SIZE;
-    align_offset = _mi_align_up( info_size, MI_SEGMENT_ALIGN );
-    const size_t extra = align_offset - info_size;
-    // recalculate due to potential guard pages
-    *psegment_slices = mi_segment_calculate_slices(required + extra, pinfo_slices);
-    mi_assert_internal(*psegment_slices > 0 && *psegment_slices <= UINT32_MAX);
+    align_offset = _mi_align_up(pre_size, MI_SEGMENT_SIZE);
+    segment_size = segment_size + (align_offset - pre_size);  // adjust the segment size
   }

-  const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE;
   mi_segment_t* segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, commit, allow_large, req_arena_id, &memid);
   if (segment == NULL) {
     return NULL;  // failed to allocate
   }

-  // ensure metadata part of the segment is committed
-  mi_commit_mask_t commit_mask;
-  if (memid.initially_committed) {
-    mi_commit_mask_create_full(&commit_mask);
-  }
-  else {
-    // at least commit the info slices
-    const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
-    mi_assert_internal(commit_needed>0);
-    mi_commit_mask_create(0, commit_needed, &commit_mask);
-    mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE);
-    if (!_mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, NULL)) {
-      _mi_arena_free(segment,segment_size,0,memid);
+  if (!memid.initially_committed) {
+    // ensure the initial info is committed
+    mi_assert_internal(!memid.is_pinned);
+    bool ok = _mi_os_commit(segment, pre_size, NULL);
+    if (!ok) {
+      // commit failed; we cannot touch the memory: free the segment directly and return `NULL`
+      _mi_arena_free(segment, segment_size, 0, memid);
       return NULL;
     }
   }
-  mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
+  MI_UNUSED(info_size);
   segment->memid = memid;
   segment->allow_decommit = !memid.is_pinned;
   segment->allow_purge = segment->allow_decommit && (mi_option_get(mi_option_purge_delay) >= 0);
   segment->segment_size = segment_size;
   segment->subproc = tld->subproc;
-  segment->commit_mask = commit_mask;
-  segment->purge_expire = 0;
-  mi_commit_mask_create_empty(&segment->purge_mask);
-
   mi_segments_track_size((long)(segment_size), tld);
   _mi_segment_map_allocated_at(segment);
   return segment;
 }
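On the align_offset adjustment in the new mi_segment_os_alloc: when a page alignment is requested, the segment is grown by align_offset - pre_size so that the payload after the segment info lands on the requested boundary. Illustrative arithmetic with assumed values (not the patch's code):

#include <stddef.h>
#include <stdio.h>

#define SEGMENT_SIZE (4u << 20)

static size_t align_up(size_t x, size_t a) { return (x + a - 1) / a * a; }

int main(void) {
  size_t pre_size = 0x4000;                 // segment info + guard page(s)
  size_t segment_size = 16u << 20;          // requested (huge) size
  size_t align_offset = align_up(pre_size, SEGMENT_SIZE);
  segment_size += align_offset - pre_size;  // grow so the payload stays aligned
  printf("align_offset=%#zx total=%#zx\n", align_offset, segment_size);
  return 0;
}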
MI_SEGMENT_NORMAL : MI_SEGMENT_HUGE); - - // _mi_memzero(segment->slices, sizeof(mi_slice_t)*(info_slices+1)); - _mi_stat_increase(&tld->stats->page_committed, mi_segment_info_size(segment)); - - // set up guard pages - size_t guard_slices = 0; - if (MI_SECURE>0) { - // in secure mode, we set up a protected page in between the segment info - // and the page data, and at the end of the segment. - size_t os_pagesize = _mi_os_page_size(); - _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize); - uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize; - mi_segment_ensure_committed(segment, end, os_pagesize); - _mi_os_protect(end, os_pagesize); - if (slice_entries == segment_slices) segment->slice_entries--; // don't use the last slice :-( - guard_slices = 1; - } - - // reserve first slices for segment info - mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices); - mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance - mi_assert_internal(segment->used == 1); - segment->used = 0; // don't count our internal slices towards usage - - // initialize initial free pages - if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page - mi_assert_internal(huge_page==NULL); - mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, false /* don't purge */, tld); + size_t capacity; + if (page_kind == MI_PAGE_HUGE) { + mi_assert_internal(page_shift == MI_SEGMENT_SHIFT + 1 && required > 0); + capacity = 1; } else { - mi_assert_internal(huge_page!=NULL); - mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask)); - mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask)); - *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices); - mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance + mi_assert_internal(required == 0 && page_alignment == 0); + size_t page_size = (size_t)1 << page_shift; + capacity = MI_SEGMENT_SIZE / page_size; + mi_assert_internal(MI_SEGMENT_SIZE % page_size == 0); + mi_assert_internal(capacity >= 1 && capacity <= MI_SMALL_PAGES_PER_SEGMENT); + } + size_t info_size; + size_t pre_size; + const size_t init_segment_size = mi_segment_calculate_sizes(capacity, required, &pre_size, &info_size); + mi_assert_internal(init_segment_size >= required); + + // Initialize parameters + const bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && // don't delay for large objects + // !_mi_os_has_overcommit() && // never delay on overcommit systems + _mi_current_thread_count() > 1 && // do not delay for the first N threads + tld->peak_count < (size_t)mi_option_get(mi_option_eager_commit_delay)); + const bool eager = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit); + const bool init_commit = eager; // || (page_kind >= MI_PAGE_LARGE); + + // Allocate the segment from the OS (segment_size can change due to alignment) + mi_segment_t* segment = mi_segment_os_alloc(eager_delayed, page_alignment, req_arena_id, pre_size, info_size, init_commit, init_segment_size, tld); + if (segment == NULL) return NULL; + mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); + mi_assert_internal(segment->memid.is_pinned ? 
segment->memid.initially_committed : true); + + // zero the segment info (but not the `mem` fields) + ptrdiff_t ofs = offsetof(mi_segment_t, next); + _mi_memzero((uint8_t*)segment + ofs, info_size - ofs); + + // initialize pages info + const bool is_huge = (page_kind == MI_PAGE_HUGE); + for (size_t i = 0; i < capacity; i++) { + mi_assert_internal(i <= 255); + segment->pages[i].segment_idx = (uint8_t)i; + segment->pages[i].is_committed = segment->memid.initially_committed; + segment->pages[i].is_zero_init = segment->memid.initially_zero; + segment->pages[i].is_huge = is_huge; + } + + // initialize + segment->page_kind = page_kind; + segment->capacity = capacity; + segment->page_shift = page_shift; + segment->segment_info_size = pre_size; + segment->thread_id = _mi_thread_id(); + segment->cookie = _mi_ptr_cookie(segment); + + // set protection + mi_segment_protect(segment, true); + + // insert in free lists for small and medium pages + if (page_kind <= MI_PAGE_MEDIUM) { + mi_segment_insert_in_free_queue(segment, tld); } - mi_assert_expensive(mi_segment_is_valid(segment,tld)); return segment; } static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { MI_UNUSED(force); - mi_assert_internal(segment != NULL); - mi_assert_internal(segment->next == NULL); - mi_assert_internal(segment->used == 0); + mi_assert(segment != NULL); // in `mi_segment_force_abandon` we set this to true to ensure the segment's memory stays valid if (segment->dont_free) return; - // Remove the free pages - mi_slice_t* slice = &segment->slices[0]; - const mi_slice_t* end = mi_segment_slices_end(segment); - #if MI_DEBUG>1 - size_t page_count = 0; - #endif - while (slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - mi_assert_internal(mi_slice_index(slice)==0 || slice->block_size == 0); // no more used pages .. 
- if (slice->block_size == 0 && segment->kind != MI_SEGMENT_HUGE) { - mi_segment_span_remove_from_queue(slice, tld); - } - #if MI_DEBUG>1 - page_count++; - #endif - slice = slice + slice->slice_count; - } - mi_assert_internal(page_count == 2); // first page is allocated by the segment itself + // don't purge as we are freeing now + mi_segment_remove_all_purges(segment, false /* don't force as we are about to free */, tld); + mi_segment_remove_from_free_queue(segment, tld); - // stats - // _mi_stat_decrease(&tld->stats->page_committed, mi_segment_info_size(segment)); + mi_assert_expensive(!mi_segment_queue_contains(&tld->small_free, segment)); + mi_assert_expensive(!mi_segment_queue_contains(&tld->medium_free, segment)); + mi_assert(segment->next == NULL); + mi_assert(segment->prev == NULL); + // _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); // return it to the OS - mi_segment_os_free(segment, tld); + mi_segment_os_free(segment, segment->segment_size, tld); +} + +/* ----------------------------------------------------------- + Free page management inside a segment +----------------------------------------------------------- */ + + +static bool mi_segment_has_free(const mi_segment_t* segment) { + return (segment->used < segment->capacity); +} + +static bool mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { + mi_assert_internal(_mi_page_segment(page) == segment); + mi_assert_internal(!page->segment_in_use); + mi_page_purge_remove(page, tld); + + // check commit + if (!mi_page_ensure_committed(segment, page, tld)) return false; + + // set in-use before doing unreset to prevent delayed reset + page->segment_in_use = true; + segment->used++; + mi_assert_internal(page->segment_in_use && page->is_committed && page->used==0 && !mi_pages_purge_contains(page,tld)); + mi_assert_internal(segment->used <= segment->capacity); + if (segment->used == segment->capacity && segment->page_kind <= MI_PAGE_MEDIUM) { + // if no more free pages, remove from the queue + mi_assert_internal(!mi_segment_has_free(segment)); + mi_segment_remove_from_free_queue(segment, tld); + } + return true; } /* ----------------------------------------------------------- - Page Free + Free ----------------------------------------------------------- */ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); -// note: can be called on abandoned pages -static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(page->block_size > 0); +// clear page data; can be called on abandoned segments +static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) +{ + mi_assert_internal(page->segment_in_use); mi_assert_internal(mi_page_all_free(page)); - mi_segment_t* segment = _mi_ptr_segment(page); - mi_assert_internal(segment->used > 0); + mi_assert_internal(page->is_committed); + mi_assert_internal(mi_page_not_in_queue(page, tld)); size_t inuse = page->capacity * mi_page_block_size(page); _mi_stat_decrease(&tld->stats->page_committed, inuse); _mi_stat_decrease(&tld->stats->pages, 1); - _mi_stat_decrease(&tld->stats->page_bins[_mi_page_bin(page)], 1); - // reset the page memory to reduce memory pressure? 
- if (segment->allow_decommit && mi_option_is_enabled(mi_option_deprecated_page_reset)) { - size_t psize; - uint8_t* start = _mi_segment_page_start(segment, page, &psize); - _mi_os_reset(start, psize); - } - - // zero the page data, but not the segment fields and heap tag page->is_zero_init = false; - uint8_t heap_tag = page->heap_tag; - ptrdiff_t ofs = offsetof(mi_page_t, capacity); - _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); - page->block_size = 1; - page->heap_tag = heap_tag; + page->segment_in_use = false; - // and free it - mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld); + // zero the page data, but not the segment fields and capacity, page start, and block_size (for page size calculations) + size_t block_size = page->block_size; + uint8_t block_size_shift = page->block_size_shift; + uint8_t heap_tag = page->heap_tag; + uint8_t* page_start = page->page_start; + uint16_t capacity = page->capacity; + uint16_t reserved = page->reserved; + ptrdiff_t ofs = offsetof(mi_page_t,capacity); + _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); + page->capacity = capacity; + page->reserved = reserved; + page->block_size = block_size; + page->block_size_shift = block_size_shift; + page->heap_tag = heap_tag; + page->page_start = page_start; segment->used--; - // cannot assert segment valid as it is called during reclaim - // mi_assert_expensive(mi_segment_is_valid(segment, tld)); - return slice; + + // schedule purge + mi_segment_schedule_purge(segment, page, tld); + + page->capacity = 0; // after purge these can be zero'd now + page->reserved = 0; } void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) @@ -1053,22 +751,27 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) mi_assert(page != NULL); mi_segment_t* segment = _mi_page_segment(page); mi_assert_expensive(mi_segment_is_valid(segment,tld)); + mi_pages_try_purge(false /*force?*/, tld); // mark it as free now - mi_segment_page_clear(page, tld); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); + mi_segment_page_clear(segment, page, tld); if (segment->used == 0) { // no more used pages; remove from the free list and free the segment mi_segment_free(segment, force, tld); } - else if (segment->used == segment->abandoned) { - // only abandoned pages; remove from free list and abandon - mi_segment_abandon(segment,tld); - } else { - // perform delayed purges - mi_segment_try_purge(segment, false /* force? */); + if (segment->used == segment->abandoned) { + // only abandoned pages; remove from free list and abandon + mi_segment_abandon(segment,tld); + } + else if (segment->used + 1 == segment->capacity) { + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // large and huge pages are always the single page in a segment + if (segment->page_kind <= MI_PAGE_MEDIUM) { + // move back to segments free list + mi_segment_insert_in_free_queue(segment,tld); + } + } } } @@ -1077,7 +780,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) Abandonment When threads terminate, they can leave segments with -live blocks (reachable through other threads). Such segments +live blocks (reached through other threads). Such segments are "abandoned" and will be reclaimed by other threads to reuse their pages and/or free them eventually. The `thread_id` of such segments is 0. 
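The scenario described above is ordinary cross-thread lifetime; a minimal hedged sketch using only the public mimalloc API (the pthread scaffolding is illustrative only, not part of this diff): a block allocated by a thread that has exited stays valid, and a later free from another thread goes through the abandoned-segment path.

#include <pthread.h>
#include <string.h>
#include <mimalloc.h>

static void* producer(void* arg) {
  char** slot = (char**)arg;
  *slot = (char*)mi_malloc(64);          // allocated in a segment owned by this thread
  strcpy(*slot, "from another thread");
  return NULL;                           // thread exits; its segment can be abandoned
}

int main(void) {
  char* p = NULL;
  pthread_t t;
  pthread_create(&t, NULL, producer, &p);
  pthread_join(t, NULL);
  // the allocating thread is gone (thread_id == 0): this free may trigger
  // the abandoned-segment machinery to reclaim or free the segment
  mi_free(p);
  return 0;
}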
@@ -1098,33 +801,22 @@ by scanning the arena memory static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->used == segment->abandoned); mi_assert_internal(segment->used > 0); - mi_assert_internal(segment->abandoned_visits == 0); - mi_assert_expensive(mi_segment_is_valid(segment,tld)); + mi_assert_expensive(mi_segment_is_valid(segment, tld)); - // remove the free pages from the free page queues - mi_slice_t* slice = &segment->slices[0]; - const mi_slice_t* end = mi_segment_slices_end(segment); - while (slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - if (slice->block_size == 0) { // a free page - mi_segment_span_remove_from_queue(slice,tld); - slice->block_size = 0; // but keep it free - } - slice = slice + slice->slice_count; - } + // Potentially force purge. Only abandoned segments in arena memory can be + // reclaimed without a free so if a segment is not from an arena we force purge here to be conservative. + mi_pages_try_purge(false /*force?*/,tld); + const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); + mi_segment_remove_all_purges(segment, force_purge, tld); - // perform delayed decommits (forcing is much slower on mstress) - // Only abandoned segments in arena memory can be reclaimed without a free - // so if a segment is not from an arena we force purge here to be conservative. - const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); - mi_segment_try_purge(segment, force_purge); + // remove the segment from the free page queue if needed + mi_segment_remove_from_free_queue(segment, tld); + mi_assert_internal(segment->next == NULL && segment->prev == NULL); // all pages in the segment are abandoned; add it to the abandoned list _mi_stat_increase(&tld->stats->segments_abandoned, 1); - mi_segments_track_size(-((long)mi_segment_size(segment)), tld); - segment->thread_id = 0; - segment->abandoned_visits = 1; // from 0 to 1 to signify it is abandoned + mi_segments_track_size(-((long)segment->segment_size), tld); + segment->abandoned_visits = 0; if (segment->was_reclaimed) { tld->reclaim_count--; segment->was_reclaimed = false; @@ -1137,10 +829,9 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); mi_assert_internal(mi_page_heap(page) == NULL); mi_segment_t* segment = _mi_page_segment(page); - - mi_assert_expensive(mi_segment_is_valid(segment,tld)); + mi_assert_expensive(!mi_pages_purge_contains(page, tld)); + mi_assert_expensive(mi_segment_is_valid(segment, tld)); segment->abandoned++; - _mi_stat_increase(&tld->stats->pages_abandoned, 1); mi_assert_internal(segment->abandoned <= segment->used); if (segment->used == segment->abandoned) { @@ -1153,40 +844,24 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { Reclaim abandoned pages ----------------------------------------------------------- */ -static mi_slice_t* mi_slices_start_iterate(mi_segment_t* segment, const mi_slice_t** end) { - mi_slice_t* slice = &segment->slices[0]; - *end = mi_segment_slices_end(segment); - mi_assert_internal(slice->slice_count>0 && slice->block_size>0); // segment allocated page - slice = slice + slice->slice_count; // skip the first segment allocated page - return slice; -} - -// Possibly free pages and check if free space is available 
-static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld) +// Possibly clear pages and check if free space is available +static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free) { - mi_assert_internal(mi_segment_is_abandoned(segment)); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); bool has_page = false; - - // for all slices - const mi_slice_t* end; - mi_slice_t* slice = mi_slices_start_iterate(segment, &end); - while (slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - if (mi_slice_is_used(slice)) { // used page + size_t pages_used = 0; + size_t pages_used_empty = 0; + for (size_t i = 0; i < segment->capacity; i++) { + mi_page_t* page = &segment->pages[i]; + if (page->segment_in_use) { + pages_used++; // ensure used count is up to date and collect potential concurrent frees - mi_page_t* const page = mi_slice_to_page(slice); _mi_page_free_collect(page, false); if (mi_page_all_free(page)) { - // if this page is all free now, free it without adding to any queues (yet) - mi_assert_internal(page->next == NULL && page->prev==NULL); - _mi_stat_decrease(&tld->stats->pages_abandoned, 1); - segment->abandoned--; - slice = mi_segment_page_clear(page, tld); // re-assign slice due to coalesce! - mi_assert_internal(!mi_slice_is_used(slice)); - if (slice->slice_count >= slices_needed) { - has_page = true; - } + // if everything free already, page can be reused for some block size + // note: don't clear the page yet as we can only OS reset it once it is reclaimed + pages_used_empty++; + has_page = true; } else if (mi_page_block_size(page) == block_size && mi_page_has_any_available(page)) { // a page has available free blocks of the right size @@ -1194,17 +869,19 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, s } } else { - // empty span - if (slice->slice_count >= slices_needed) { - has_page = true; - } + // whole empty page + has_page = true; } - slice = slice + slice->slice_count; + } + mi_assert_internal(pages_used == segment->used && pages_used >= pages_used_empty); + if (all_pages_free != NULL) { + *all_pages_free = ((pages_used - pages_used_empty) == 0); } return has_page; } -// Reclaim an abandoned segment; returns NULL if the segment was freed + +// Reclaim a segment; returns NULL if the segment was freed // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. 
static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } @@ -1215,25 +892,21 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, segment->abandoned_visits = 0; segment->was_reclaimed = true; tld->reclaim_count++; - mi_segments_track_size((long)mi_segment_size(segment), tld); - mi_assert_internal(segment->next == NULL); + mi_segments_track_size((long)segment->segment_size, tld); + mi_assert_internal(segment->next == NULL && segment->prev == NULL); + mi_assert_expensive(mi_segment_is_valid(segment, tld)); _mi_stat_decrease(&tld->stats->segments_abandoned, 1); - // for all slices - const mi_slice_t* end; - mi_slice_t* slice = mi_slices_start_iterate(segment, &end); - while (slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - if (mi_slice_is_used(slice)) { - // in use: reclaim the page in our heap - mi_page_t* page = mi_slice_to_page(slice); + for (size_t i = 0; i < segment->capacity; i++) { + mi_page_t* page = &segment->pages[i]; + if (page->segment_in_use) { mi_assert_internal(page->is_committed); + mi_assert_internal(mi_page_not_in_queue(page, tld)); mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); mi_assert_internal(mi_page_heap(page) == NULL); - mi_assert_internal(page->next == NULL && page->prev==NULL); - _mi_stat_decrease(&tld->stats->pages_abandoned, 1); segment->abandoned--; + mi_assert(page->next == NULL); + _mi_stat_decrease(&tld->stats->pages_abandoned, 1); // get the target heap for this thread which has a matching heap tag (so we reclaim into a matching heap) mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects if (target_heap == NULL) { @@ -1245,8 +918,8 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) _mi_page_free_collect(page, false); // ensure used count is up to date if (mi_page_all_free(page)) { - // if everything free by now, free the page - slice = mi_segment_page_clear(page, tld); // set slice again due to coalesceing + // if everything free already, clear the page directly + mi_segment_page_clear(segment, page, tld); // reset is ok now } else { // otherwise reclaim it into the heap @@ -1256,22 +929,23 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, } } } - else { - // the span is free, add it to our page queues - slice = mi_segment_span_free_coalesce(slice, tld); // set slice again due to coalesceing + /* expired + else if (page->is_committed) { // not in-use, and not reset yet + // note: do not reset as this includes pages that were not touched before + // mi_pages_purge_add(segment, page, tld); } - mi_assert_internal(slice->slice_count>0 && slice->slice_offset==0); - slice = slice + slice->slice_count; + */ } - - mi_assert(segment->abandoned == 0); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - if (segment->used == 0) { // due to page_clear + mi_assert_internal(segment->abandoned == 0); + if (segment->used == 0) { mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed)); mi_segment_free(segment, false, tld); return NULL; } else { + if (segment->page_kind <= MI_PAGE_MEDIUM && mi_segment_has_free(segment)) { + 
mi_segment_insert_in_free_queue(segment, tld); + } return segment; } } @@ -1328,7 +1002,7 @@ static long mi_segment_get_reclaim_tries(mi_segments_tld_t* tld) { return max_tries; } -static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slices, size_t block_size, bool* reclaimed, mi_segments_tld_t* tld) +static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; long max_tries = mi_segment_get_reclaim_tries(tld); @@ -1346,8 +1020,9 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries // Perhaps we can skip non-suitable ones in a better way? bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid); - bool has_page = mi_segment_check_free(segment,needed_slices,block_size,tld); // try to free up pages (due to concurrent frees) - if (segment->used == 0) { + bool all_pages_free; + bool has_page = mi_segment_check_free(segment,block_size,&all_pages_free); // try to free up pages (due to concurrent frees) + if (all_pages_free) { // free the segment (by forced reclaim) to make it available to other threads. // note1: we prefer to free a segment as that might lead to reclaiming another // segment that is still partially used. // note2: we could in principle optimize this by skipping reclaim and directly // freeing but that would violate some invariants temporarily) mi_segment_reclaim(segment, heap, 0, NULL, tld); } - else if (has_page && is_suitable) { - // found a large enough free span, or a page of the right block_size with free space + else if (has_page && segment->page_kind == page_kind && is_suitable) { + // found a free page of the right kind, or page of the right block_size with free space // we return the result of reclaim (which is usually `segment`) as it might free // the segment due to concurrent frees (in which case `NULL` is returned). result = mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); @@ -1367,9 +1042,9 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice mi_segment_reclaim(segment, heap, 0, NULL, tld); } else { - // otherwise, push on the visited list so it gets not looked at too quickly again + // otherwise, mark it back as abandoned + // todo: reset delayed pages in the segment? max_tries++; // don't count this as a try since it was not suitable - mi_segment_try_purge(segment, false /* true force? */); // force purge if needed as we may not visit soon again _mi_arena_segment_mark_abandoned(segment); } } @@ -1377,72 +1052,38 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice return result; } -// collect abandoned segments -void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld) -{ - mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, force /* blocking? */, &current); - long max_tries = (force ? (long)mi_atomic_load_relaxed(&tld->subproc->abandoned_count) : 1024); // limit latency - while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(&current)) != NULL)) { - mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees) - if (segment->used == 0) { - // free the segment (by forced reclaim) to make it available to other threads.
- // note: we could in principle optimize this by skipping reclaim and directly - // freeing but that would violate some invariants temporarily) - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - else { - // otherwise, purge if needed and push on the visited list - // note: forced purge can be expensive if many threads are destroyed/created as in mstress. - mi_segment_try_purge(segment, force); - _mi_arena_segment_mark_abandoned(segment); - } - } - _mi_arena_field_cursor_done(&current); -} /* ----------------------------------------------------------- - Force abandon a segment that is in use by our thread + Force abandon a segment that is in use by our thread ----------------------------------------------------------- */ // force abandon a segment static void mi_segment_force_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(!mi_segment_is_abandoned(segment)); + mi_assert_internal(segment->abandoned < segment->used); mi_assert_internal(!segment->dont_free); // ensure the segment does not get free'd underneath us (so we can check if a page has been freed in `mi_page_force_abandon`) segment->dont_free = true; - // for all slices - const mi_slice_t* end; - mi_slice_t* slice = mi_slices_start_iterate(segment, &end); - while (slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - if (mi_slice_is_used(slice)) { - // ensure used count is up to date and collect potential concurrent frees - mi_page_t* const page = mi_slice_to_page(slice); - _mi_page_free_collect(page, false); - { - // abandon the page if it is still in-use (this will free it if possible as well) - mi_assert_internal(segment->used > 0); - if (segment->used == segment->abandoned+1) { - // the last page.. abandon and return as the segment will be abandoned after this - // and we should no longer access it. - segment->dont_free = false; - _mi_page_force_abandon(page); - return; - } - else { - // abandon and continue - _mi_page_force_abandon(page); - // it might be freed, reset the slice (note: relies on coalesce setting the slice_offset) - slice = mi_slice_first(slice); - } + // for all pages + for (size_t i = 0; i < segment->capacity; i++) { + mi_page_t* page = &segment->pages[i]; + if (page->segment_in_use) { + // abandon the page if it is still in-use (this will free the page if possible as well (but not our segment)) + mi_assert_internal(segment->used > 0); + if (segment->used == segment->abandoned+1) { + // the last page.. abandon and return as the segment will be abandoned after this + // and we should no longer access it. + segment->dont_free = false; + _mi_page_force_abandon(page); + return; + } + else { + // abandon and continue + _mi_page_force_abandon(page); } } - slice = slice + slice->slice_count; } segment->dont_free = false; mi_assert(segment->used == segment->abandoned); @@ -1453,7 +1094,7 @@ static void mi_segment_force_abandon(mi_segment_t* segment, mi_segments_tld_t* t } else { // perform delayed purges - mi_segment_try_purge(segment, false /* force? */); + mi_pages_try_purge(false /* force?
*/, tld); } } @@ -1466,7 +1107,7 @@ static void mi_segments_try_abandon_to_target(mi_heap_t* heap, size_t target, mi // todo: we should maintain a list of segments per thread; for now, only consider segments from the heap full pages for (int i = 0; i < 64 && tld->count >= min_target; i++) { mi_page_t* page = heap->pages[MI_BIN_FULL].first; - while (page != NULL && mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX) { + while (page != NULL && mi_page_is_huge(page)) { page = page->next; } if (page==NULL) { @@ -1502,8 +1143,9 @@ void mi_collect_reduce(size_t target_size) mi_attr_noexcept { Reclaim or allocate ----------------------------------------------------------- */ -static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld) +static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld) { + mi_assert_internal(page_kind <= MI_PAGE_LARGE); mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); // try to abandon some segments to increase reuse between threads @@ -1511,83 +1153,126 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_ // 1. try to reclaim an abandoned segment bool reclaimed; - mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld); + mi_segment_t* segment = mi_segment_try_reclaim(heap, block_size, page_kind, &reclaimed, tld); + mi_assert_internal(segment == NULL || _mi_arena_memid_is_suitable(segment->memid, heap->arena_id)); if (reclaimed) { // reclaimed the right page right into the heap - mi_assert_internal(segment != NULL); + mi_assert_internal(segment != NULL && segment->page_kind == page_kind && page_kind <= MI_PAGE_LARGE); return NULL; // pretend out-of-memory as the page will be in the page queue of the heap with available blocks } else if (segment != NULL) { - // reclaimed a segment with a large enough empty span in it + // reclaimed a segment with empty pages (of `page_kind`) in it return segment; } // 2. otherwise allocate a fresh segment - return mi_segment_alloc(0, 0, heap->arena_id, tld, NULL); + return mi_segment_alloc(0, page_kind, page_shift, 0, heap->arena_id, tld); } /* ----------------------------------------------------------- - Page allocation + Small page allocation ----------------------------------------------------------- */ -static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_kind, size_t required, size_t block_size, mi_segments_tld_t* tld) -{ - mi_assert_internal(required <= MI_LARGE_OBJ_SIZE_MAX && page_kind <= MI_PAGE_LARGE); - - // find a free page - size_t page_size = _mi_align_up(required, (required > MI_MEDIUM_PAGE_SIZE ? MI_MEDIUM_PAGE_SIZE : MI_SEGMENT_SLICE_SIZE)); - size_t slices_needed = page_size / MI_SEGMENT_SLICE_SIZE; - mi_assert_internal(slices_needed * MI_SEGMENT_SLICE_SIZE == page_size); - mi_page_t* page = mi_segments_page_find_and_allocate(slices_needed, heap->arena_id, tld); //(required <= MI_SMALL_SIZE_MAX ? 
0 : slices_needed), tld); - if (page==NULL) { - // no free page, allocate a new segment and try again - if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld) == NULL) { - // OOM or reclaimed a good page in the heap - return NULL; - } - else { - // otherwise try again - return mi_segments_page_alloc(heap, page_kind, required, block_size, tld); +static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t* tld) { + mi_assert_internal(mi_segment_has_free(segment)); + mi_assert_expensive(mi_segment_is_valid(segment, tld)); + for (size_t i = 0; i < segment->capacity; i++) { // TODO: use a bitmap instead of search? + mi_page_t* page = &segment->pages[i]; + if (!page->segment_in_use) { + bool ok = mi_segment_page_claim(segment, page, tld); + if (ok) return page; } } - mi_assert_internal(page != NULL && page->slice_count*MI_SEGMENT_SLICE_SIZE == page_size); - mi_assert_internal(_mi_ptr_segment(page)->thread_id == _mi_thread_id()); - mi_segment_try_purge(_mi_ptr_segment(page), false); + mi_assert(false); + return NULL; +} + +// Allocate a page inside a segment. Requires that the segment has free pages +static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) { + mi_assert_internal(mi_segment_has_free(segment)); + return mi_segment_find_free(segment, tld); +} + +static mi_page_t* mi_segment_page_try_alloc_in_queue(mi_heap_t* heap, mi_page_kind_t kind, mi_segments_tld_t* tld) { + // find an available segment in the segment free queue + mi_segment_queue_t* const free_queue = mi_segment_free_queue_of_kind(kind, tld); + for (mi_segment_t* segment = free_queue->first; segment != NULL; segment = segment->next) { + if (_mi_arena_memid_is_suitable(segment->memid, heap->arena_id) && mi_segment_has_free(segment)) { + return mi_segment_page_alloc_in(segment, tld); + } + } + return NULL; +} + +static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld) { + mi_page_t* page = mi_segment_page_try_alloc_in_queue(heap, kind, tld); + if (page == NULL) { + // possibly allocate or reclaim a fresh segment + mi_segment_t* const segment = mi_segment_reclaim_or_alloc(heap, block_size, kind, page_shift, tld); + if (segment == NULL) return NULL; // return NULL if out-of-memory (or reclaimed) + mi_assert_internal(segment->page_kind==kind); + mi_assert_internal(segment->used < segment->capacity); + mi_assert_internal(_mi_arena_memid_is_suitable(segment->memid, heap->arena_id)); + page = mi_segment_page_try_alloc_in_queue(heap, kind, tld); // this should now succeed + } + mi_assert_internal(page != NULL); + #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN + // verify it is committed + mi_segment_raw_page_start(_mi_page_segment(page), page, NULL)[0] = 0; + #endif return page; } +static mi_page_t* mi_segment_small_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld) { + return mi_segment_page_alloc(heap, block_size, MI_PAGE_SMALL,MI_SMALL_PAGE_SHIFT,tld); +} +static mi_page_t* mi_segment_medium_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld) { + return mi_segment_page_alloc(heap, block_size, MI_PAGE_MEDIUM, MI_MEDIUM_PAGE_SHIFT, tld); +} /* ----------------------------------------------------------- - Huge page allocation + Large page allocation ----------------------------------------------------------- */ +static mi_page_t* mi_segment_large_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld) { + mi_segment_t* segment = 
mi_segment_reclaim_or_alloc(heap,block_size,MI_PAGE_LARGE,MI_LARGE_PAGE_SHIFT,tld); + if (segment == NULL) return NULL; + mi_page_t* page = mi_segment_find_free(segment, tld); + mi_assert_internal(page != NULL); +#if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN + mi_segment_raw_page_start(segment, page, NULL)[0] = 0; +#endif + return page; +} + static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld) { - mi_page_t* page = NULL; - mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,&page); - if (segment == NULL || page==NULL) return NULL; - mi_assert_internal(segment->used==1); - mi_assert_internal(mi_page_block_size(page) >= size); + mi_segment_t* segment = mi_segment_alloc(size, MI_PAGE_HUGE, MI_SEGMENT_SHIFT + 1, page_alignment, req_arena_id, tld); + if (segment == NULL) return NULL; + mi_assert_internal(mi_segment_page_size(segment) - segment->segment_info_size - (2*(MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= size); #if MI_HUGE_PAGE_ABANDON - segment->thread_id = 0; // huge segments are immediately abandoned + segment->thread_id = 0; // huge pages are immediately abandoned + mi_segments_track_size(-(long)segment->segment_size, tld); #endif + mi_page_t* page = mi_segment_find_free(segment, tld); + mi_assert_internal(page != NULL); + mi_assert_internal(page->is_huge); // for huge pages we initialize the block_size as we may // overallocate to accommodate large alignments. size_t psize; - uint8_t* start = _mi_segment_page_start(segment, page, &psize); + uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); page->block_size = psize; - mi_assert_internal(page->is_huge); - // decommit the part of the prefix of a page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) - if (page_alignment > 0 && segment->allow_decommit) { + // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) + if (page_alignment > 0 && segment->allow_decommit && page->is_committed) { uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment)); mi_assert_internal(psize - (aligned_p - start) >= size); - uint8_t* decommit_start = start + sizeof(mi_block_t); // for the free list + uint8_t* decommit_start = start + sizeof(mi_block_t); // for the free list ptrdiff_t decommit_size = aligned_p - decommit_start; - _mi_os_reset(decommit_start, decommit_size); // note: cannot use segment_decommit on huge segments + _mi_os_reset(decommit_start, decommit_size); // do not decommit as it may be in a region } return page; @@ -1597,7 +1282,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, // free huge block from another thread void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { // huge page segments are always abandoned and can be freed immediately by any thread - mi_assert_internal(segment->kind==MI_SEGMENT_HUGE); + mi_assert_internal(segment->page_kind==MI_PAGE_HUGE); mi_assert_internal(segment == _mi_page_segment(page)); mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0); @@ -1612,6 +1297,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block page->is_zero_init = false; mi_assert(page->used == 0); mi_tld_t* tld = heap->tld; + mi_segments_track_size((long)segment->segment_size, &tld->segments); _mi_segment_page_free(page, true, &tld->segments); } 
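At the API level, alignments above MI_BLOCK_ALIGNMENT_MAX are what route into this huge-page path; a hedged sketch below (the 8 MiB alignment is an assumed value, chosen only to exceed MI_BLOCK_ALIGNMENT_MAX on a default build):

#include <assert.h>
#include <stdint.h>
#include <mimalloc.h>

int main(void) {
  const size_t align = 8 * 1024 * 1024;     // assumed to exceed MI_BLOCK_ALIGNMENT_MAX
  void* p = mi_malloc_aligned(100, align);  // expected to take the huge-page path above
  assert(p != NULL && ((uintptr_t)p % align) == 0);
  mi_free(p);                               // huge pages may be freed from any thread
  return 0;
}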
#if (MI_DEBUG!=0) @@ -1624,47 +1310,50 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block #else // reset memory of a huge block from another thread void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { - MI_UNUSED(page); - mi_assert_internal(segment->kind == MI_SEGMENT_HUGE); + mi_assert_internal(segment->page_kind == MI_PAGE_HUGE); mi_assert_internal(segment == _mi_page_segment(page)); mi_assert_internal(page->used == 1); // this is called just before the free mi_assert_internal(page->free == NULL); - if (segment->allow_decommit) { - size_t csize = mi_usable_size(block); - if (csize > sizeof(mi_block_t)) { - csize = csize - sizeof(mi_block_t); + if (segment->allow_decommit && page->is_committed) { + size_t usize = mi_usable_size(block); + if (usize > sizeof(mi_block_t)) { + usize = usize - sizeof(mi_block_t); uint8_t* p = (uint8_t*)block + sizeof(mi_block_t); - _mi_os_reset(p, csize); // note: cannot use segment_decommit on huge segments + _mi_os_reset(p, usize); } } } #endif /* ----------------------------------------------------------- - Page allocation and free + Page allocation ----------------------------------------------------------- */ + mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld) { mi_page_t* page; if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { mi_assert_internal(_mi_is_power_of_two(page_alignment)); mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); + //mi_assert_internal((MI_SEGMENT_SIZE % page_alignment) == 0); if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; } - page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld); + page = mi_segment_huge_page_alloc(block_size, page_alignment, heap->arena_id, tld); } else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { - page = mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld); + page = mi_segment_small_page_alloc(heap, block_size, tld); } else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) { - page = mi_segments_page_alloc(heap,MI_PAGE_MEDIUM,MI_MEDIUM_PAGE_SIZE,block_size,tld); + page = mi_segment_medium_page_alloc(heap, block_size, tld); } - else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) { - page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld); + else if (block_size <= MI_LARGE_OBJ_SIZE_MAX /* || mi_is_good_fit(block_size, MI_LARGE_PAGE_SIZE - sizeof(mi_segment_t)) */ ) { + page = mi_segment_large_page_alloc(heap, block_size, tld); } else { - page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld); + page = mi_segment_huge_page_alloc(block_size, page_alignment, heap->arena_id, tld); } - mi_assert_internal(page == NULL || _mi_heap_memid_is_suitable(heap, _mi_page_segment(page)->memid)); mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); + mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); + // mi_segment_try_purge(tld); + mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; } @@ -1687,16 +1376,13 @@ static bool mi_segment_visit_page(mi_page_t* page, bool visit_blocks, mi_block_v } bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - const mi_slice_t* end; - mi_slice_t* slice = mi_slices_start_iterate(segment, &end); - while (slice < end) { - if (mi_slice_is_used(slice)) { - mi_page_t* const page = mi_slice_to_page(slice); + for (size_t i = 0; i < segment->capacity; i++) { + mi_page_t* const page = &segment->pages[i]; + if (page->segment_in_use) { if (heap_tag < 0 || (int)page->heap_tag == heap_tag) { if (!mi_segment_visit_page(page, visit_blocks, visitor, arg)) return false; } } - slice = slice + slice->slice_count; } return true; } diff --git a/src/stats.c b/src/stats.c index 34b3d4e4..1cfc3104 100644 --- a/src/stats.c +++ b/src/stats.c @@ -30,7 +30,6 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { { // add atomically (for abandoned pages) int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); - // if (stat == &_mi_stats_main.committed) { mi_assert_internal(current + amount >= 0); }; mi_atomic_maxi64_relaxed(&stat->peak, current + amount); if (amount > 0) { mi_atomic_addi64_relaxed(&stat->total,amount); @@ -62,25 +61,6 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { } -static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) { - if (amount == 0) return; - if mi_unlikely(mi_is_in_main(stat)) - { - // adjust atomically - mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_addi64_relaxed(&stat->total,amount); - } - else { - // adjust local - stat->current += amount; - stat->total += amount; - } -} - -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) { - mi_stat_adjust(stat, -((int64_t)amount)); -} - // must be thread safe as it is called from stats_merge static void mi_stat_count_add_mt(mi_stat_count_t* stat, const mi_stat_count_t* src) { @@ -114,8 +94,8 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { } #endif for (size_t i = 0; i <= MI_BIN_HUGE; i++) { - mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]); - } + mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]); + } } #undef MI_STAT_COUNT @@ -218,15 +198,6 @@ static void mi_stat_peak_print(const mi_stat_count_t* stat, const char* msg, int _mi_fprintf(out, arg, "\n"); } -#if MI_STAT>1 -static void mi_stat_total_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) { - _mi_fprintf(out, arg, "%10s:", msg); - _mi_fprintf(out, arg, "%12s", " "); // no peak - mi_print_amount(stat->total, unit, out, arg); - _mi_fprintf(out, arg, "\n"); -} -#endif - static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) { _mi_fprintf(out, arg, "%10s:", msg); mi_print_amount(stat->total, -1, out, arg); @@ -243,7 +214,7 @@ static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* static void mi_print_header(mi_output_fun* out, void* arg ) { - _mi_fprintf(out, arg, "%10s: %11s %11s %11s %11s %11s\n", "heap stats", "peak ", "total ", "current ", "block ", "total# "); + _mi_fprintf(out, arg, "%10s: %11s %11s %11s %11s %11s\n", "heap stats", "peak ", "total ", "current 
", "unit ", "total# "); } #if MI_STAT>1 @@ -312,20 +283,18 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) // and print using that mi_print_header(out,arg); #if MI_STAT>1 - mi_stats_print_bins(stats->malloc_bins, MI_BIN_HUGE, "bin",out,arg); + mi_stats_print_bins(stats->malloc_bins, MI_BIN_HUGE, "normal",out,arg); #endif #if MI_STAT - mi_stat_print(&stats->malloc_normal, "binned", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg); - // mi_stat_print(&stats->malloc_large, "large", (stats->malloc_large_count.total == 0 ? 1 : -1), out, arg); + mi_stat_print(&stats->malloc_normal, "normal", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg); mi_stat_print(&stats->malloc_huge, "huge", (stats->malloc_huge_count.total == 0 ? 1 : -1), out, arg); mi_stat_count_t total = { 0,0,0 }; mi_stat_count_add_mt(&total, &stats->malloc_normal); - // mi_stat_count_add(&total, &stats->malloc_large); mi_stat_count_add_mt(&total, &stats->malloc_huge); mi_stat_print_ex(&total, "total", 1, out, arg, ""); #endif #if MI_STAT>1 - mi_stat_total_print(&stats->malloc_requested, "malloc req", 1, out, arg); + mi_stat_print_ex(&stats->malloc_requested, "malloc req", 1, out, arg, ""); _mi_fprintf(out, arg, "\n"); #endif mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, ""); @@ -350,7 +319,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_counter_print(&stats->malloc_guarded_count, "guarded", out, arg); mi_stat_print(&stats->threads, "threads", -1, out, arg); mi_stat_counter_print_avg(&stats->page_searches, "searches", out, arg); - _mi_fprintf(out, arg, "%10s: %5i\n", "numa nodes", _mi_os_numa_node_count()); + _mi_fprintf(out, arg, "%10s: %5zu\n", "numa nodes", _mi_os_numa_node_count()); size_t elapsed; size_t user_time; @@ -361,9 +330,9 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) size_t peak_commit; size_t page_faults; mi_process_info(&elapsed, &user_time, &sys_time, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); - _mi_fprintf(out, arg, "%10s: %5zu.%03zu s\n", "elapsed", elapsed/1000, elapsed%1000); - _mi_fprintf(out, arg, "%10s: user: %zu.%03zu s, system: %zu.%03zu s, faults: %zu, rss: ", "process", - user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, page_faults ); + _mi_fprintf(out, arg, "%10s: %5ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000); + _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process", + user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults ); mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s"); if (peak_commit > 0) { _mi_fprintf(out, arg, ", commit: "); @@ -397,10 +366,6 @@ void mi_stats_merge(void) mi_attr_noexcept { mi_stats_merge_from( mi_stats_get_default() ); } -void _mi_stats_merge_thread(mi_tld_t* tld) { - mi_stats_merge_from( &tld->stats ); -} - void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` mi_stats_merge_from(stats); } @@ -504,7 +469,7 @@ static bool mi_heap_buf_expand(mi_heap_buf_t* hbuf) { hbuf->buf[hbuf->size-1] = 0; } if (hbuf->size > SIZE_MAX/2 || !hbuf->can_realloc) return false; - const size_t newsize = (hbuf->size == 0 ? mi_good_size(12*MI_KiB) : 2*hbuf->size); + const size_t newsize = (hbuf->size == 0 ? 
2*MI_KiB : 2*hbuf->size); char* const newbuf = (char*)mi_rezalloc(hbuf->buf, newsize); if (newbuf == NULL) return false; hbuf->buf = newbuf; @@ -531,12 +496,7 @@ static void mi_heap_buf_print_count_bin(mi_heap_buf_t* hbuf, const char* prefix, const size_t binsize = _mi_bin_size(bin); const size_t pagesize = (binsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_SMALL_PAGE_SIZE : (binsize <= MI_MEDIUM_OBJ_SIZE_MAX ? MI_MEDIUM_PAGE_SIZE : - #if MI_LARGE_PAGE_SIZE - (binsize <= MI_LARGE_OBJ_SIZE_MAX ? MI_LARGE_PAGE_SIZE : 0) - #else - 0 - #endif - )); + (binsize <= MI_LARGE_OBJ_SIZE_MAX ? MI_LARGE_PAGE_SIZE : 0))); char buf[128]; _mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld, \"block_size\": %zu, \"page_size\": %zu }%s\n", prefix, stat->total, stat->peak, stat->current, binsize, pagesize, (add_comma ? "," : "")); buf[127] = 0; @@ -629,7 +589,7 @@ char* mi_stats_get_json(size_t output_size, char* output_buf) mi_attr_noexcept { for (size_t i = 0; i <= MI_BIN_HUGE; i++) { mi_heap_buf_print_count_bin(&hbuf, " ", &stats->page_bins[i], i, i!=MI_BIN_HUGE); } - mi_heap_buf_print(&hbuf, " ]\n"); + mi_heap_buf_print(&hbuf, " ]\n"); mi_heap_buf_print(&hbuf, "}\n"); return hbuf.buf; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c5fff1a6..5905613c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -16,7 +16,7 @@ if (NOT CMAKE_BUILD_TYPE) endif() # Import mimalloc (if installed) -find_package(mimalloc 2.2 CONFIG REQUIRED) +find_package(mimalloc 1.9 CONFIG REQUIRED) message(STATUS "Found mimalloc installed at: ${MIMALLOC_LIBRARY_DIR} (${MIMALLOC_VERSION_DIR})") diff --git a/test/main-override-dep.cpp b/test/main-override-dep.cpp index d89e3fca..e92f6fc4 100644 --- a/test/main-override-dep.cpp +++ b/test/main-override-dep.cpp @@ -1,7 +1,6 @@ // Issue #981: test overriding allocation in a DLL that is compiled independent of mimalloc. // This is imported by the `mimalloc-test-override` project. #include <string> -#include <iostream> #include "main-override-dep.h" std::string TestAllocInDll::GetString() @@ -11,41 +10,6 @@ std::string TestAllocInDll::GetString() const char* t = "test"; memcpy(test, t, 4); std::string r = test; - std::cout << "override-dep: GetString: " << r << "\n"; delete[] test; return r; -} - - -class Static { -private: - void* p; -public: - Static() { - printf("override-dep: static constructor\n"); - p = malloc(64); - return; - } - ~Static() { - free(p); - printf("override-dep: static destructor\n"); - return; - } -}; - -static Static s = Static(); - - -#include <windows.h> - -BOOL WINAPI DllMain(HINSTANCE module, DWORD reason, LPVOID reserved) { - (void)(reserved); - (void)(module); - if (reason==DLL_PROCESS_ATTACH) { - printf("override-dep: dll attach\n"); - } - else if (reason==DLL_PROCESS_DETACH) { - printf("override-dep: dll detach\n"); - } - return TRUE; -} +} \ No newline at end of file diff --git a/test/main-override-static.c b/test/main-override-static.c index 420b8bf7..06d7baa5 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -10,6 +10,7 @@ #include <mimalloc.h> #include <mimalloc-override.h> // redefines malloc etc. 
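The "redefines malloc etc." context line above refers to mimalloc's static override header; a hedged sketch of its effect, assuming the usual macro-based redirection to the mi_ entry points:

#include <stdio.h>
#include <mimalloc.h>
#include <mimalloc-override.h>  // after this, malloc/free map to mi_malloc/mi_free

int main(void) {
  void* p = malloc(128);                       // serviced by mi_malloc(128)
  printf("usable: %zu\n", mi_usable_size(p));  // mimalloc recognizes the block
  free(p);                                     // serviced by mi_free(p)
  return 0;
}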
+static void mi_bins(void); static void double_free1(); static void double_free2(); @@ -23,12 +24,11 @@ static void test_reserved(void); static void negative_stat(void); static void alloc_huge(void); static void test_heap_walk(void); -static void test_heap_arena(void); -static void test_align(void); static void test_canary_leak(void); static void test_manage_os_memory(void); // static void test_large_pages(void); + int main() { mi_version(); mi_stats_reset(); @@ -43,17 +43,15 @@ int main() { // corrupt_free(); // block_overflow1(); // block_overflow2(); - test_canary_leak(); + // test_canary_leak(); // test_aslr(); // invalid_free(); // test_reserved(); // negative_stat(); // test_heap_walk(); // alloc_huge(); - // test_heap_walk(); - // test_heap_arena(); - // test_align(); - + + void* p1 = malloc(78); void* p2 = malloc(24); free(p1); @@ -69,7 +67,7 @@ int main() { free(p1); free(p2); free(s); - + /* now test if override worked by allocating/freeing across the api's*/ //p1 = mi_malloc(32); //free(p1); @@ -84,13 +82,6 @@ int main() { return 0; } -static void test_align() { - void* p = mi_malloc_aligned(256, 256); - if (((uintptr_t)p % 256) != 0) { - fprintf(stderr, "%p is not 256 alignend!\n", p); - } -} - static void invalid_free() { free((void*)0xBADBEEF); realloc((void*)0xBADBEEF,10); @@ -248,20 +239,6 @@ static void test_heap_walk(void) { mi_heap_visit_blocks(heap, true, &test_visit, NULL); } -static void test_heap_arena(void) { - mi_arena_id_t arena_id; - int err = mi_reserve_os_memory_ex(100 * 1024 * 1024, false /* commit */, false /* allow large */, true /* exclusive */, &arena_id); - if (err) abort(); - mi_heap_t* heap = mi_heap_new_in_arena(arena_id); - for (int i = 0; i < 500000; i++) { - void* p = mi_heap_malloc(heap, 1024); - if (p == NULL) { - printf("out of memory after %d kb (expecting about 100_000kb)\n", i); - break; - } - } -} - static void test_canary_leak(void) { char* p = mi_mallocn_tp(char,23); for(int i = 0; i < 23; i++) { diff --git a/test/main-override.cpp b/test/main-override.cpp index 75b409fd..db594acc 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -27,12 +27,9 @@ static void heap_late_free(); // issue #204 static void padding_shrink(); // issue #209 static void various_tests(); static void test_mt_shutdown(); -static void large_alloc(void); // issue #363 static void fail_aslr(); // issue #372 static void tsan_numa_test(); // issue #414 static void strdup_test(); // issue #445 -static void bench_alloc_large(void); // issue #xxx -//static void test_large_migrate(void); // issue #691 static void heap_thread_free_huge(); static void test_std_string(); // issue #697 static void test_thread_local(); // issue #944 @@ -40,7 +37,7 @@ static void test_thread_local(); // issue #944 static void test_mixed1(); // issue #942 static void test_stl_allocators(); -#if _WIN32 +#if x_WIN32 #include "main-override-dep.h" static void test_dep(); // issue #981: test overriding in another DLL #else @@ -58,20 +55,18 @@ int main() { //test_thread_local(); // heap_thread_free_huge(); /* - heap_thread_free_huge(); - heap_thread_free_large(); - heap_no_delete(); - heap_late_free(); - padding_shrink(); - various_tests(); - large_alloc(); - tsan_numa_test(); - strdup_test(); - */ - // test_stl_allocators(); - // test_mt_shutdown(); - // test_large_migrate(); + heap_thread_free_large(); + heap_no_delete(); + heap_late_free(); + padding_shrink(); + tsan_numa_test(); + */ + /* + strdup_test(); + test_stl_allocators(); + test_mt_shutdown(); + */ //fail_aslr(); 
mi_stats_print(NULL); return 0; } @@ -150,12 +145,11 @@ static bool test_stl_allocator1() { struct some_struct { int i; int j; double z; }; -#if _WIN32 +#if x_WIN32 static void test_dep() { TestAllocInDll t; std::string s = t.GetString(); - std::cout << "test_dep GetString: " << s << "\n"; } #endif @@ -364,7 +358,7 @@ static void heap_thread_free_large_worker() { static void heap_thread_free_large() { for (int i = 0; i < 100; i++) { - shared_p = mi_malloc_aligned(2 * 1024 * 1024 + 1, 8); + shared_p = mi_malloc_aligned(2*1024*1024 + 1, 8); auto t1 = std::thread(heap_thread_free_large_worker); t1.join(); } @@ -375,13 +369,14 @@ static void heap_thread_free_huge_worker() { } static void heap_thread_free_huge() { - for (int i = 0; i < 100; i++) { + for (int i = 0; i < 10; i++) { shared_p = mi_malloc(1024 * 1024 * 1024); auto t1 = std::thread(heap_thread_free_huge_worker); t1.join(); } } + static void test_mt_shutdown() { const int threads = 5; @@ -406,18 +401,6 @@ static void test_mt_shutdown() std::cout << "done" << std::endl; } -// issue #363 -using namespace std; - -void large_alloc(void) -{ - char* a = new char[1ull << 25]; - thread th([&] { - delete[] a; - }); - th.join(); -} - // issue #372 static void fail_aslr() { size_t sz = (size_t)(4ULL << 40); // 4TiB @@ -438,36 +421,6 @@ static void tsan_numa_test() { t1.join(); } -// issue #? -#include <chrono> -#include <memory> -#include <random> - -static void bench_alloc_large(void) { - static constexpr int kNumBuffers = 20; - static constexpr size_t kMinBufferSize = 5 * 1024 * 1024; - static constexpr size_t kMaxBufferSize = 25 * 1024 * 1024; - std::unique_ptr<char[]> buffers[kNumBuffers]; - - std::random_device rd; (void)rd; - std::mt19937 gen(42); //rd()); - std::uniform_int_distribution<> size_distribution(kMinBufferSize, kMaxBufferSize); - std::uniform_int_distribution<> buf_number_distribution(0, kNumBuffers - 1); - - static constexpr int kNumIterations = 2000; - const auto start = std::chrono::steady_clock::now(); - for (int i = 0; i < kNumIterations; ++i) { - int buffer_idx = buf_number_distribution(gen); - size_t new_size = size_distribution(gen); - buffers[buffer_idx] = std::make_unique<char[]>(new_size); - } - const auto end = std::chrono::steady_clock::now(); - const auto num_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count(); - const auto us_per_allocation = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / kNumIterations; - std::cout << kNumIterations << " allocations Done in " << num_ms << "ms." 
<< std::endl; - std::cout << "Avg " << us_per_allocation << " us per allocation" << std::endl; -} - class MTest { @@ -494,4 +447,4 @@ void test_thread_local() mi_stats_print(NULL); } return; -} +} \ No newline at end of file diff --git a/test/test-api.c b/test/test-api.c index 6f5d6722..15484544 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -203,11 +203,7 @@ int main(void) { CHECK_BODY("malloc-aligned9") { // test large alignments bool ok = true; void* p[8]; - size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, - #if SIZE_MAX > UINT32_MAX - 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, - #endif - 0 }; + size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 }; for (int i = 0; i < 28 && ok; i++) { int align = (1 << i); for (int j = 0; j < 8 && ok; j++) { diff --git a/test/test-stress.c b/test/test-stress.c index 4f5a3d58..9e041064 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -320,17 +320,11 @@ int main(int argc, char** argv) { // Run ITER full iterations where half the objects in the transfer buffer survive to the next round. srand(0x7feb352d); - - //mi_reserve_os_memory(512ULL << 20, true, true); - - #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) - mi_stats_reset(); - #endif - + // mi_stats_reset(); #ifdef STRESS - test_stress(); + test_stress(); #else - test_leak(); + test_leak(); #endif #ifndef USE_STD_MALLOC @@ -343,7 +337,6 @@ int main(int argc, char** argv) { mi_free(json); } #endif - mi_collect(true); mi_stats_print(NULL); #endif //bench_end_program();
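As a closing usage note for the stats calls exercised by these tests, a hedged sketch: the (output_size, output_buf) signature of mi_stats_get_json matches src/stats.c above, passing (0, NULL) is assumed to allocate a buffer the caller releases with mi_free, and the declaring header is assumed to come with the mimalloc headers this branch installs.

#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  void* p = mi_malloc(1024);
  mi_free(p);
  mi_stats_print(NULL);                     // human-readable stats to the default output
  char* json = mi_stats_get_json(0, NULL);  // 0/NULL: let mimalloc allocate the buffer
  if (json != NULL) {
    fputs(json, stdout);
    mi_free(json);                          // caller frees the returned buffer
  }
  return 0;
}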