Compare commits


No commits in common. "main" and "v1.9.2" have entirely different histories.
main ... v1.9.2

51 changed files with 1474 additions and 2481 deletions

@ -126,7 +126,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENE
set(MI_ARCH "x64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv[89].?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
set(MI_ARCH "arm64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567].?|ARM)$")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567]|ARM)$")
set(MI_ARCH "arm32")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$")
if(CMAKE_SIZEOF_VOID_P==4)
@ -173,8 +173,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "Intel")
list(APPEND mi_cflags -Wall)
endif()
# force C++ compilation with msvc or clang-cl to use modern C++ atomics
if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel" OR MI_CLANG_CL)
if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel")
set(MI_USE_CXX "ON")
endif()
@ -435,7 +434,7 @@ endif()
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku")
if(MI_OPT_ARCH)
if(APPLE AND CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999)
if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999)
if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a")
endif()
@ -533,9 +532,7 @@ if(MI_TRACK_ASAN)
endif()
string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LC)
list(APPEND mi_defines "MI_CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE_LC}") #todo: multi-config project needs $<CONFIG> ?
if(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel|none)$")
list(APPEND mi_defines MI_BUILD_RELEASE)
else()
if(NOT(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel|none)$"))
set(mi_libname "${mi_libname}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. -debug) if not a release version
endif()
@ -585,7 +582,7 @@ if(MI_BUILD_SHARED)
install(TARGETS mimalloc EXPORT mimalloc ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir})
if(WIN32 AND NOT MINGW)
if(WIN32)
# On windows, the import library name for the dll would clash with the static mimalloc.lib library
# so we postfix the dll import library with `.dll.lib` (and also the .pdb debug file)
set_property(TARGET mimalloc PROPERTY ARCHIVE_OUTPUT_NAME "${mi_libname}.dll" )
@ -595,9 +592,6 @@ if(MI_BUILD_SHARED)
# install(FILES "$<TARGET_FILE_DIR:mimalloc>/${mi_libname}.dll.pdb" DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
if(WIN32 AND MI_WIN_REDIRECT)
if(MINGW)
set_property(TARGET mimalloc PROPERTY PREFIX "")
endif()
# On windows, link and copy the mimalloc redirection dll too.
if(CMAKE_GENERATOR_PLATFORM STREQUAL "arm64ec")
set(MIMALLOC_REDIRECT_SUFFIX "-arm64ec")
@ -713,12 +707,10 @@ if (MI_BUILD_TESTS)
target_compile_definitions(mimalloc-test-${TEST_NAME} PRIVATE ${mi_defines})
target_compile_options(mimalloc-test-${TEST_NAME} PRIVATE ${mi_cflags})
target_include_directories(mimalloc-test-${TEST_NAME} PRIVATE include)
if(MI_BUILD_STATIC AND NOT MI_DEBUG_TSAN)
target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc-static ${mi_libraries})
elseif(MI_BUILD_SHARED)
if(MI_BUILD_SHARED AND (MI_TRACK_ASAN OR MI_DEBUG_TSAN OR MI_DEBUG_UBSAN))
target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc ${mi_libraries})
else()
message(STATUS "cannot build TSAN tests without MI_BUILD_SHARED being enabled")
target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc-static ${mi_libraries})
endif()
add_test(NAME test-${TEST_NAME} COMMAND mimalloc-test-${TEST_NAME})
endforeach()
@ -727,19 +719,21 @@ if (MI_BUILD_TESTS)
if(MI_BUILD_SHARED AND NOT (MI_TRACK_ASAN OR MI_DEBUG_TSAN OR MI_DEBUG_UBSAN))
add_executable(mimalloc-test-stress-dynamic test/test-stress.c)
target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE ${mi_defines} "USE_STD_MALLOC=1")
if(WIN32)
target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE "MI_LINK_VERSION=1")
endif()
target_compile_options(mimalloc-test-stress-dynamic PRIVATE ${mi_cflags})
target_include_directories(mimalloc-test-stress-dynamic PRIVATE include)
target_link_libraries(mimalloc-test-stress-dynamic PRIVATE mimalloc ${mi_libraries}) # mi_version
if(WIN32)
target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE "MI_LINK_VERSION=1") # link mi_version
target_link_libraries(mimalloc-test-stress-dynamic PRIVATE mimalloc ${mi_libraries}) # link mi_version
add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_VERBOSE=1 $<TARGET_FILE:mimalloc-test-stress-dynamic>)
add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_SHOW_STATS=1 $<TARGET_FILE:mimalloc-test-stress-dynamic>)
else()
if(APPLE)
set(LD_PRELOAD "DYLD_INSERT_LIBRARIES")
else()
set(LD_PRELOAD "LD_PRELOAD")
endif()
add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_VERBOSE=1 ${LD_PRELOAD}=$<TARGET_FILE:mimalloc> $<TARGET_FILE:mimalloc-test-stress-dynamic>)
add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_SHOW_STATS=1 ${LD_PRELOAD}=$<TARGET_FILE:mimalloc> $<TARGET_FILE:mimalloc-test-stress-dynamic>)
endif()
endif()
endif()

@ -6,8 +6,10 @@
trigger:
branches:
include:
- main
- dev*
- master
- dev
- dev2
- dev3
tags:
include:
- v*
@ -32,22 +34,6 @@ jobs:
BuildType: secure
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
MSBuildConfiguration: Release
Debug x86:
BuildType: debug
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -A Win32
MSBuildConfiguration: Debug
Release x86:
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -A Win32
MSBuildConfiguration: Release
Debug Fixed TLS:
BuildType: debug
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_WIN_USE_FIXED_TLS=ON
MSBuildConfiguration: Debug
Release Fixed TLS:
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_WIN_USE_FIXED_TLS=ON
MSBuildConfiguration: Release
steps:
- task: CMake@1
inputs:
@ -175,7 +161,6 @@ jobs:
- script: ctest --verbose --timeout 240
workingDirectory: $(BuildType)
displayName: CTest
# - upload: $(Build.SourcesDirectory)/$(BuildType)
# artifact: mimalloc-macos-$(BuildType)
@ -183,6 +168,35 @@ jobs:
# Other OS versions (just debug mode)
# ----------------------------------------------------------
- job:
displayName: Windows 2019
pool:
vmImage:
windows-2019
strategy:
matrix:
Debug:
BuildType: debug
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
MSBuildConfiguration: Debug
Release:
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
MSBuildConfiguration: Release
steps:
- task: CMake@1
inputs:
workingDirectory: $(BuildType)
cmakeArgs: .. $(cmakeExtraArgs)
- task: MSBuild@1
inputs:
solution: $(BuildType)/libmimalloc.sln
configuration: '$(MSBuildConfiguration)'
msbuildArguments: -m
- script: ctest --verbose --timeout 240 -C $(MSBuildConfiguration)
workingDirectory: $(BuildType)
displayName: CTest
- job:
displayName: Ubuntu 24.04
pool:

Binary files changed (Executable file → Normal file; binary contents not shown):
bin/mimalloc-redirect-arm64.dll
bin/mimalloc-redirect-arm64.lib
bin/mimalloc-redirect-arm64ec.dll
bin/mimalloc-redirect-arm64ec.lib
bin/mimalloc-redirect.dll
bin/mimalloc-redirect.lib
bin/mimalloc-redirect32.dll
bin/mimalloc-redirect32.lib

@ -1,6 +1,6 @@
set(mi_version_major 2)
set(mi_version_minor 2)
set(mi_version_patch 5)
set(mi_version_major 1)
set(mi_version_minor 9)
set(mi_version_patch 2)
set(mi_version ${mi_version_major}.${mi_version_minor})
set(PACKAGE_VERSION ${mi_version})

@ -1,6 +1,6 @@
# install from an image
# download first an appropriate tar.gz image into the current directory
# from <https://github.com/alpinelinux/docker-alpine/tree/edge/armv7>
# from: <https://github.com/alpinelinux/docker-alpine/tree/edge/armv7>
FROM scratch
# Substitute the image name that was downloaded

@ -1,28 +0,0 @@
# install from an image
# download first an appropriate tar.gz image into the current directory
# from <https://github.com/alpinelinux/docker-alpine/tree/edge/x86>
FROM scratch
# Substitute the image name that was downloaded
ADD alpine-minirootfs-20250108-x86.tar.gz /
# Install tools
RUN apk add build-base make cmake
RUN apk add git
RUN apk add vim
RUN mkdir -p /home/dev
WORKDIR /home/dev
# Get mimalloc
RUN git clone https://github.com/microsoft/mimalloc -b dev2
RUN mkdir -p mimalloc/out/release
RUN mkdir -p mimalloc/out/debug
# Build mimalloc debug
WORKDIR /home/dev/mimalloc/out/debug
RUN cmake ../.. -DMI_DEBUG_FULL=ON
# RUN make -j
# RUN make test
CMD ["/bin/sh"]

@ -4,12 +4,12 @@ vcpkg_from_github(
HEAD_REF master
# The "REF" can be a commit hash, branch name (dev2), or a version (v2.2.1).
REF "v${VERSION}"
# REF e2db21e9ba9fb9172b7b0aa0fe9b8742525e8774
# REF "v${VERSION}"
REF 866ce5b89db1dbc3e66bbf89041291fd16329518
# The sha512 is the hash of the tar.gz bundle.
# (To get the sha512, run `vcpkg install "mimalloc[override]" --overlay-ports=./contrib/vcpkg` and copy the sha from the error message.)
SHA512 5218fcd3ad285687ed3f78b4651d7d3aee92b6f28e6c563a884975e654a43c94c4e5c02c5ed0322c3d3627d83d4843df2d2d8441f09aa18d00674ca9fd657345
# (To get the sha512, run `vcpkg install mimalloc[override] --overlay-ports=<dir of this file>` and copy the sha from the error message.)
SHA512 0b0e5ff823c49b9534b8c32800679806c5d7c29020af058da043c3e6e36ae3c32a1cdd5a21ece97dd60bc7dd4703967f683beac435dbb8514638a6cc55e5dea8
)
vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS
@ -19,7 +19,6 @@ vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS
secure MI_SECURE
override MI_OVERRIDE
optarch MI_OPT_ARCH
nooptarch MI_NO_OPT_ARCH
optsimd MI_OPT_SIMD
xmalloc MI_XMALLOC
asm MI_SEE_ASM

@ -1,7 +1,7 @@
{
"name": "mimalloc",
"version": "2.2.4",
"port-version": 1,
"version": "1.9.2",
"port-version": 2,
"description": "Compact general purpose allocator with excellent performance",
"homepage": "https://github.com/microsoft/mimalloc",
"license": "MIT",
@ -35,9 +35,6 @@
"optarch": {
"description": "Use architecture specific optimizations (on x64: '-march=haswell;-mavx2', on arm64: '-march=armv8.1-a')"
},
"nooptarch": {
"description": "Do _not_ use architecture specific optimizations (on x64: '-march=haswell;-mavx2', on arm64: '-march=armv8.1-a')"
},
"optsimd": {
"description": "Allow use of SIMD instructions (avx2 or neon) (requires 'optarch' to be enabled)"
},

@ -282,8 +282,8 @@
</ClCompile>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="mimalloc-override-dll.vcxproj">
<Project>{abb5eae7-b3e6-432e-b636-333449892ea7}</Project>
<ProjectReference Include="mimalloc-lib.vcxproj">
<Project>{abb5eae7-b3e6-432e-b636-333449892ea6}</Project>
</ProjectReference>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
#define MI_MALLOC_VERSION 225 // major + 2 digits minor
#define MI_MALLOC_VERSION 192 // major + 2 digits minor
// ------------------------------------------------------
// Compiler specific attributes
@ -97,7 +97,6 @@ terms of the MIT license. A copy of the license can be found in the file
#include <stddef.h> // size_t
#include <stdbool.h> // bool
#include <stdint.h> // INTPTR_MAX
#ifdef __cplusplus
extern "C" {
@ -154,21 +153,17 @@ mi_decl_export void mi_stats_reset(void) mi_attr_noexcept;
mi_decl_export void mi_stats_merge(void) mi_attr_noexcept;
mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept; // backward compatibility: `out` is ignored and should be NULL
mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
mi_decl_export void mi_options_print(void) mi_attr_noexcept;
mi_decl_export void mi_process_init(void) mi_attr_noexcept;
mi_decl_export void mi_thread_init(void) mi_attr_noexcept;
mi_decl_export void mi_thread_done(void) mi_attr_noexcept;
mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs,
size_t* current_rss, size_t* peak_rss,
size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept;
// Generally do not use the following as these are usually called automatically
mi_decl_export void mi_process_init(void) mi_attr_noexcept;
mi_decl_export void mi_cdecl mi_process_done(void) mi_attr_noexcept;
mi_decl_export void mi_thread_init(void) mi_attr_noexcept;
mi_decl_export void mi_thread_done(void) mi_attr_noexcept;
// -------------------------------------------------------------------------------------
// Aligned allocation
// Note that `alignment` always follows `size` for consistency with unaligned

@ -111,7 +111,6 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,(tp*)des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,(tp*)des)
#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des) mi_atomic_cas_strong_acq_rel(p,exp,(tp*)des)
#define mi_atomic_exchange_ptr_relaxed(tp,p,x) mi_atomic_exchange_relaxed(p,(tp*)x)
#define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,(tp*)x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,(tp*)x)
@ -121,7 +120,6 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,des)
#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des) mi_atomic_cas_strong_acq_rel(p,exp,des)
#define mi_atomic_exchange_ptr_relaxed(tp,p,x) mi_atomic_exchange_relaxed(p,x)
#define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,x)
@ -268,13 +266,6 @@ static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int6
return current;
#endif
}
static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) {
const int64_t add = *padd;
if (add != 0) {
mi_atomic_addi64_relaxed((volatile _Atomic(int64_t)*)p, add);
}
}
static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) {
int64_t current;
do {
@ -305,7 +296,6 @@ static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p,
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des) mi_atomic_cas_strong_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_exchange_ptr_relaxed(tp,p,x) (tp*)mi_atomic_exchange_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
#define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
@ -373,9 +363,8 @@ static inline void mi_atomic_yield(void) {
_mm_pause();
}
#elif (defined(__GNUC__) || defined(__clang__)) && \
(defined(__x86_64__) || defined(__i386__) || \
defined(__aarch64__) || defined(__arm__) || \
defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__))
(defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \
defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) || defined(__POWERPC__)
#if defined(__x86_64__) || defined(__i386__)
static inline void mi_atomic_yield(void) {
__asm__ volatile ("pause" ::: "memory");
@ -384,16 +373,10 @@ static inline void mi_atomic_yield(void) {
static inline void mi_atomic_yield(void) {
__asm__ volatile("wfe");
}
#elif defined(__arm__)
#if __ARM_ARCH >= 7
#elif (defined(__arm__) && __ARM_ARCH__ >= 7)
static inline void mi_atomic_yield(void) {
__asm__ volatile("yield" ::: "memory");
}
#else
static inline void mi_atomic_yield(void) {
__asm__ volatile ("nop" ::: "memory");
}
#endif
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__)
#ifdef __APPLE__
static inline void mi_atomic_yield(void) {
@ -404,6 +387,10 @@ static inline void mi_atomic_yield(void) {
__asm__ __volatile__ ("or 27,27,27" ::: "memory");
}
#endif
#elif defined(__armel__) || defined(__ARMEL__)
static inline void mi_atomic_yield(void) {
__asm__ volatile ("nop" ::: "memory");
}
#endif
#elif defined(__sun)
// Fallback for other archs
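
For context, `mi_atomic_yield` is a CPU-relaxation hint meant for spin loops; a minimal standalone sketch of the kind of caller it serves (the flag and loop below are illustrative, not mimalloc code):

#include <stdatomic.h>
#include <stdbool.h>

// Illustrative spin-wait: relax the CPU while waiting for another thread to
// publish `flag`; mi_atomic_yield expands to pause/wfe/yield/nop per target.
static void spin_until_set(_Atomic(bool)* flag) {
  while (!atomic_load_explicit(flag, memory_order_acquire)) {
    mi_atomic_yield();
  }
}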

@ -8,6 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_INTERNAL_H
#define MIMALLOC_INTERNAL_H
// --------------------------------------------------------------------------
// This file contains the internal API's of mimalloc and various utility
// functions and macros.
@ -16,88 +17,50 @@ terms of the MIT license. A copy of the license can be found in the file
#include "types.h"
#include "track.h"
// --------------------------------------------------------------------------
// Compiler defines
// --------------------------------------------------------------------------
#if (MI_DEBUG>0)
#define mi_trace_message(...) _mi_trace_message(__VA_ARGS__)
#else
#define mi_trace_message(...)
#endif
#define mi_decl_cache_align mi_decl_align(64)
#define MI_CACHE_LINE 64
#if defined(_MSC_VER)
#pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths)
#pragma warning(disable:26812) // unscoped enum warning
#define mi_decl_noinline __declspec(noinline)
#define mi_decl_thread __declspec(thread)
#define mi_decl_align(a) __declspec(align(a))
#define mi_decl_noreturn __declspec(noreturn)
#define mi_decl_cache_align __declspec(align(MI_CACHE_LINE))
#define mi_decl_weak
#define mi_decl_hidden
#define mi_decl_cold
#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
#define mi_decl_noinline __attribute__((noinline))
#define mi_decl_thread __thread
#define mi_decl_align(a) __attribute__((aligned(a)))
#define mi_decl_noreturn __attribute__((noreturn))
#define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE)))
#define mi_decl_weak __attribute__((weak))
#define mi_decl_hidden __attribute__((visibility("hidden")))
#if (__GNUC__ >= 4) || defined(__clang__)
#define mi_decl_cold __attribute__((cold))
#else
#define mi_decl_cold
#endif
#elif __cplusplus >= 201103L // c++11
#define mi_decl_noinline
#define mi_decl_thread thread_local
#define mi_decl_align(a) alignas(a)
#define mi_decl_noreturn [[noreturn]]
#define mi_decl_cache_align alignas(MI_CACHE_LINE)
#define mi_decl_weak
#define mi_decl_hidden
#define mi_decl_cold
#else
#define mi_decl_noinline
#define mi_decl_thread __thread // hope for the best :-)
#define mi_decl_align(a)
#define mi_decl_noreturn
#define mi_decl_cache_align
#define mi_decl_weak
#define mi_decl_hidden
#define mi_decl_cold
#endif
#if defined(__GNUC__) || defined(__clang__)
#define mi_unlikely(x) (__builtin_expect(!!(x),false))
#define mi_likely(x) (__builtin_expect(!!(x),true))
#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
#define mi_unlikely(x) (x) [[unlikely]]
#define mi_likely(x) (x) [[likely]]
#else
#define mi_unlikely(x) (x)
#define mi_likely(x) (x)
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#if defined(__cplusplus)
#define mi_decl_externc extern "C"
#else
#define mi_decl_externc
#endif
#if defined(__EMSCRIPTEN__) && !defined(__wasi__)
#define __wasi__
#endif
// --------------------------------------------------------------------------
// Internal functions
// --------------------------------------------------------------------------
#if defined(__cplusplus)
#define mi_decl_externc extern "C"
#else
#define mi_decl_externc
#endif
// "libc.c"
#include <stdarg.h>
@ -133,10 +96,10 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed);
static inline uintptr_t _mi_random_shuffle(uintptr_t x);
// init.c
extern mi_decl_hidden mi_decl_cache_align mi_stats_t _mi_stats_main;
extern mi_decl_cache_align mi_stats_t _mi_stats_main;
extern mi_decl_hidden mi_decl_cache_align const mi_page_t _mi_page_empty;
void _mi_auto_process_init(void);
void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept;
void _mi_process_load(void);
void mi_cdecl _mi_process_done(void);
bool _mi_is_redirected(void);
bool _mi_allocator_init(const char** message);
void _mi_allocator_done(void);
@ -154,7 +117,6 @@ void _mi_heap_guarded_init(mi_heap_t* heap);
// os.c
void _mi_os_init(void); // called from process init
void* _mi_os_alloc(size_t size, mi_memid_t* memid);
void* _mi_os_zalloc(size_t size, mi_memid_t* memid);
void _mi_os_free(void* p, size_t size, mi_memid_t memid);
void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid);
@ -164,14 +126,12 @@ bool _mi_os_has_overcommit(void);
bool _mi_os_has_virtual_reserve(void);
bool _mi_os_reset(void* addr, size_t size);
bool _mi_os_commit(void* p, size_t size, bool* is_zero);
bool _mi_os_decommit(void* addr, size_t size);
bool _mi_os_protect(void* addr, size_t size);
bool _mi_os_unprotect(void* addr, size_t size);
bool _mi_os_purge(void* p, size_t size);
bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size);
void _mi_os_reuse(void* p, size_t size);
mi_decl_nodiscard bool _mi_os_commit(void* p, size_t size, bool* is_zero);
mi_decl_nodiscard bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size);
bool _mi_os_protect(void* addr, size_t size);
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid);
void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid);
@ -179,10 +139,8 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t
void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size);
bool _mi_os_use_large_page(size_t size, size_t alignment);
size_t _mi_os_large_page_size(void);
void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid);
int _mi_os_numa_node_count(void);
int _mi_os_numa_node(void);
void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid);
// arena.c
mi_arena_id_t _mi_arena_id_none(void);
@ -219,11 +177,10 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment);
void _mi_segment_map_unsafe_destroy(void);
// "segment.c"
mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld);
void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld);
void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld);
void _mi_segment_collect(mi_segment_t* segment, bool force);
mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld);
void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld);
void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size);
#if MI_HUGE_PAGE_ABANDON
void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
@ -231,11 +188,10 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, m
void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
#endif
uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page
void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld);
bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment);
bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);
void _mi_segments_collect(bool force, mi_segments_tld_t* tld);
void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment);
bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);
// "page.c"
void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc;
@ -258,7 +214,6 @@ void _mi_deferred_free(mi_heap_t* heap, bool force);
void _mi_page_free_collect(mi_page_t* page,bool force);
void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments
size_t _mi_page_bin(const mi_page_t* page); // for stats
size_t _mi_bin_size(size_t bin); // for stats
size_t _mi_bin(size_t size); // for stats
@ -275,7 +230,6 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* pa
// "stats.c"
void _mi_stats_done(mi_stats_t* stats);
void _mi_stats_merge_thread(mi_tld_t* tld);
mi_msecs_t _mi_clock_now(void);
mi_msecs_t _mi_clock_end(mi_msecs_t start);
mi_msecs_t _mi_clock_start(void);
@ -297,6 +251,26 @@ bool _mi_page_is_valid(mi_page_t* page);
#endif
// ------------------------------------------------------
// Branches
// ------------------------------------------------------
#if defined(__GNUC__) || defined(__clang__)
#define mi_unlikely(x) (__builtin_expect(!!(x),false))
#define mi_likely(x) (__builtin_expect(!!(x),true))
#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
#define mi_unlikely(x) (x) [[unlikely]]
#define mi_likely(x) (x) [[likely]]
#else
#define mi_unlikely(x) (x)
#define mi_likely(x) (x)
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
/* -----------------------------------------------------------
Error codes passed to `_mi_fatal_error`
All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
@ -321,32 +295,6 @@ bool _mi_page_is_valid(mi_page_t* page);
#endif
// ------------------------------------------------------
// Assertions
// ------------------------------------------------------
#if (MI_DEBUG)
// use our own assertion to print without memory allocation
mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func) mi_attr_noexcept;
#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
#else
#define mi_assert(x)
#endif
#if (MI_DEBUG>1)
#define mi_assert_internal mi_assert
#else
#define mi_assert_internal(x)
#endif
#if (MI_DEBUG>2)
#define mi_assert_expensive mi_assert
#else
#define mi_assert_expensive(x)
#endif
/* -----------------------------------------------------------
Inlined definitions
----------------------------------------------------------- */
@ -393,28 +341,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
}
}
// Align downwards
static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
mi_assert_internal(alignment != 0);
uintptr_t mask = alignment - 1;
if ((alignment & mask) == 0) { // power of two?
return (sz & ~mask);
}
else {
return ((sz / alignment) * alignment);
}
}
// Align a pointer upwards
static inline void* mi_align_up_ptr(void* p, size_t alignment) {
return (void*)_mi_align_up((uintptr_t)p, alignment);
}
// Align a pointer downwards
static inline void* mi_align_down_ptr(void* p, size_t alignment) {
return (void*)_mi_align_down((uintptr_t)p, alignment);
}
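
As a quick illustration of the power-of-two fast path these alignment helpers rely on (a standalone sketch, not mimalloc code): with `mask = alignment-1`, `(sz + mask) & ~mask` rounds up and `sz & ~mask` rounds down.

#include <assert.h>
#include <stdint.h>

// Worked example of power-of-two align-up/align-down with alignment 8.
int main(void) {
  const uintptr_t mask = 8 - 1;
  assert(((13 + mask) & ~mask) == 16);  // round 13 up to a multiple of 8
  assert((13 & ~mask) == 8);            // round 13 down to a multiple of 8
  return 0;
}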
// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
@ -438,7 +370,6 @@ static inline bool mi_mem_is_zero(const void* p, size_t size) {
return true;
}
// Align a byte size to a size in _machine words_,
// i.e. byte size == `wsize*sizeof(void*)`.
static inline size_t _mi_wsize_from_size(size_t size) {
@ -533,44 +464,29 @@ static inline mi_segment_t* _mi_ptr_segment(const void* p) {
#endif
}
static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) {
mi_assert_internal(s->slice_offset== 0 && s->slice_count > 0);
return (mi_page_t*)(s);
}
static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) {
mi_assert_internal(p->slice_offset== 0 && p->slice_count > 0);
return (mi_slice_t*)(p);
}
// Segment belonging to a page
static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
mi_assert_internal(page!=NULL);
mi_segment_t* segment = _mi_ptr_segment(page);
mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries));
mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]);
return segment;
}
static inline mi_slice_t* mi_slice_first(const mi_slice_t* slice) {
mi_slice_t* start = (mi_slice_t*)((uint8_t*)slice - slice->slice_offset);
mi_assert_internal(start >= _mi_ptr_segment(slice)->slices);
mi_assert_internal(start->slice_offset == 0);
mi_assert_internal(start + start->slice_count > slice);
return start;
// used internally
static inline size_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) {
// if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages
ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
mi_assert_internal(diff >= 0 && (size_t)diff <= MI_SEGMENT_SIZE /* for huge alignment it can be equal */);
size_t idx = (size_t)diff >> segment->page_shift;
mi_assert_internal(idx < segment->capacity);
mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0);
return idx;
}
// Get the page containing the pointer (performance critical as it is called in mi_free)
// Get the page containing the pointer
static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
mi_assert_internal(p > (void*)segment);
ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
mi_assert_internal(diff > 0 && diff <= (ptrdiff_t)MI_SEGMENT_SIZE);
size_t idx = (size_t)diff >> MI_SEGMENT_SLICE_SHIFT;
mi_assert_internal(idx <= segment->slice_entries);
mi_slice_t* slice0 = (mi_slice_t*)&segment->slices[idx];
mi_slice_t* slice = mi_slice_first(slice0); // adjust to the block that holds the page data
mi_assert_internal(slice->slice_offset == 0);
mi_assert_internal(slice >= segment->slices && slice < segment->slices + segment->slice_entries);
return mi_slice_to_page(slice);
size_t idx = _mi_segment_page_idx_of(segment, p);
return &((mi_segment_t*)segment)->pages[idx];
}
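
Putting `_mi_ptr_segment` and `_mi_segment_page_idx_of` together: because segments are `MI_SEGMENT_SIZE`-aligned, any interior pointer can be mapped to its metadata with a mask and a shift. A compact sketch, assuming the 1.x `mi_segment_t`/`mi_page_t` declarations from `types.h` shown further below:

#include <stddef.h>
#include <stdint.h>

// From a raw pointer to its mimalloc page (1.x layout): mask the low bits to
// find the segment, then shift the in-segment offset by `page_shift` to get
// the page index.
static mi_page_t* page_of_pointer(const void* p) {
  mi_segment_t* segment = (mi_segment_t*)((uintptr_t)p & ~MI_SEGMENT_MASK);
  size_t idx = (size_t)((uint8_t*)p - (uint8_t*)segment) >> segment->page_shift;
  return &segment->pages[idx];
}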
// Quick page start for initialized pages
@ -593,8 +509,8 @@ static inline size_t mi_page_block_size(const mi_page_t* page) {
}
static inline bool mi_page_is_huge(const mi_page_t* page) {
mi_assert_internal((page->is_huge && _mi_page_segment(page)->kind == MI_SEGMENT_HUGE) ||
(!page->is_huge && _mi_page_segment(page)->kind != MI_SEGMENT_HUGE));
mi_assert_internal((page->is_huge && _mi_page_segment(page)->page_kind == MI_PAGE_HUGE) ||
(!page->is_huge && _mi_page_segment(page)->page_kind != MI_PAGE_HUGE));
return page->is_huge;
}
@ -606,11 +522,7 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
// size of a segment
static inline size_t mi_segment_size(mi_segment_t* segment) {
return segment->segment_slices * MI_SEGMENT_SLICE_SIZE;
}
static inline uint8_t* mi_segment_end(mi_segment_t* segment) {
return (uint8_t*)segment + mi_segment_size(segment);
return segment->segment_size;
}
// Thread free access
@ -765,13 +677,12 @@ static inline bool mi_is_in_same_segment(const void* p, const void* q) {
}
static inline bool mi_is_in_same_page(const void* p, const void* q) {
mi_segment_t* segment = _mi_ptr_segment(p);
if (_mi_ptr_segment(q) != segment) return false;
// assume q may be invalid // return (_mi_segment_page_of(segment, p) == _mi_segment_page_of(segment, q));
mi_page_t* page = _mi_segment_page_of(segment, p);
size_t psize;
uint8_t* start = _mi_segment_page_start(segment, page, &psize);
return (start <= (uint8_t*)q && (uint8_t*)q < start + psize);
mi_segment_t* segmentp = _mi_ptr_segment(p);
mi_segment_t* segmentq = _mi_ptr_segment(q);
if (segmentp != segmentq) return false;
size_t idxp = _mi_segment_page_idx_of(segmentp, p);
size_t idxq = _mi_segment_page_idx_of(segmentq, q);
return (idxp == idxq);
}
static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) {
@ -853,50 +764,6 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c
}
// -------------------------------------------------------------------
// commit mask
// -------------------------------------------------------------------
static inline void mi_commit_mask_create_empty(mi_commit_mask_t* cm) {
for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
cm->mask[i] = 0;
}
}
static inline void mi_commit_mask_create_full(mi_commit_mask_t* cm) {
for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
cm->mask[i] = ~((size_t)0);
}
}
static inline bool mi_commit_mask_is_empty(const mi_commit_mask_t* cm) {
for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
if (cm->mask[i] != 0) return false;
}
return true;
}
static inline bool mi_commit_mask_is_full(const mi_commit_mask_t* cm) {
for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
if (cm->mask[i] != ~((size_t)0)) return false;
}
return true;
}
// defined in `segment.c`:
size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total);
size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx);
#define mi_commit_mask_foreach(cm,idx,count) \
idx = 0; \
while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) {
#define mi_commit_mask_foreach_end() \
idx += count; \
}
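
A hypothetical usage sketch for these foreach macros (2.x side only; `committed_slices` is an invented helper, the real accounting lives in `_mi_commit_mask_committed_size`):

// Count the committed slices in a mask by walking its runs.
static size_t committed_slices(const mi_commit_mask_t* cm) {
  size_t idx;
  size_t count;
  size_t total = 0;
  mi_commit_mask_foreach(cm, idx, count)
    total += count;          // a run of `count` committed slices starts at `idx`
  mi_commit_mask_foreach_end()
  return total;
}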
/* -----------------------------------------------------------
memory id's
----------------------------------------------------------- */
@ -912,10 +779,8 @@ static inline mi_memid_t _mi_memid_none(void) {
return _mi_memid_create(MI_MEM_NONE);
}
static inline mi_memid_t _mi_memid_create_os(void* base, size_t size, bool committed, bool is_zero, bool is_large) {
static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) {
mi_memid_t memid = _mi_memid_create(MI_MEM_OS);
memid.mem.os.base = base;
memid.mem.os.size = size;
memid.initially_committed = committed;
memid.initially_zero = is_zero;
memid.is_pinned = is_large;
@ -947,6 +812,24 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) {
return x;
}
// -------------------------------------------------------------------
// Optimize numa node access for the common case (= one node)
// -------------------------------------------------------------------
int _mi_os_numa_node_get(void);
size_t _mi_os_numa_node_count_get(void);
extern mi_decl_hidden _Atomic(size_t) _mi_numa_node_count;
static inline int _mi_os_numa_node(void) {
if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; }
else return _mi_os_numa_node_get();
}
static inline size_t _mi_os_numa_node_count(void) {
const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count);
if mi_likely(count > 0) { return count; }
else return _mi_os_numa_node_count_get();
}
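
The pattern used here is a lazily cached global with a relaxed-load fast path; a self-contained sketch of the same idea (names invented):

#include <stdatomic.h>
#include <stddef.h>

// Cache an expensive-to-compute value (here: a node count) in an atomic;
// the common case is a single relaxed load, the slow path fills the cache.
static _Atomic(size_t) cached_node_count;   // 0 means "not detected yet"

static size_t detect_node_count_slow(void) {
  size_t n = 1;  // e.g. query the OS here
  atomic_store_explicit(&cached_node_count, n, memory_order_relaxed);
  return n;
}

static size_t node_count(void) {
  const size_t n = atomic_load_explicit(&cached_node_count, memory_order_relaxed);
  return (n > 0 ? n : detect_node_count_slow());
}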
// -----------------------------------------------------------------------
@ -987,7 +870,7 @@ static inline size_t mi_clz(size_t x) {
#else
_BitScanReverse64(&idx, x);
#endif
return ((MI_SIZE_BITS - 1) - (size_t)idx);
return ((MI_SIZE_BITS - 1) - idx);
}
static inline size_t mi_ctz(size_t x) {
if (x==0) return MI_SIZE_BITS;
@ -997,7 +880,7 @@ static inline size_t mi_ctz(size_t x) {
#else
_BitScanForward64(&idx, x);
#endif
return (size_t)idx;
return idx;
}
#else
@ -1064,21 +947,6 @@ static inline size_t mi_bsr(size_t x) {
return (x==0 ? MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x));
}
size_t _mi_popcount_generic(size_t x);
static inline size_t mi_popcount(size_t x) {
if (x<=1) return x;
if (x==SIZE_MAX) return MI_SIZE_BITS;
#if defined(__GNUC__)
#if (SIZE_MAX == ULONG_MAX)
return __builtin_popcountl(x);
#else
return __builtin_popcountll(x);
#endif
#else
return _mi_popcount_generic(x);
#endif
}
// ---------------------------------------------------------------------------------
// Provide our own `_mi_memcpy` for potential performance optimizations.
@ -1090,8 +958,8 @@ static inline size_t mi_popcount(size_t x) {
#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
extern mi_decl_hidden bool _mi_cpu_has_fsrm;
extern mi_decl_hidden bool _mi_cpu_has_erms;
extern bool _mi_cpu_has_fsrm;
extern bool _mi_cpu_has_erms;
static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) {
__movsb((unsigned char*)dst, (const unsigned char*)src, n);

@ -59,15 +59,10 @@ int _mi_prim_commit(void* addr, size_t size, bool* is_zero);
// pre: needs_recommit != NULL
int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit);
// Reset memory. The range keeps being accessible but the content might be reset to zero at any moment.
// Reset memory. The range keeps being accessible but the content might be reset.
// Returns error code or 0 on success.
int _mi_prim_reset(void* addr, size_t size);
// Reuse memory. This is called for memory that is already committed but
// may have been reset (`_mi_prim_reset`) or decommitted (`_mi_prim_decommit`) where `needs_recommit` was false.
// Returns error code or 0 on success. On most platforms this is a no-op.
int _mi_prim_reuse(void* addr, size_t size);
// Protect memory. Returns error code or 0 on success.
int _mi_prim_protect(void* addr, size_t size, bool protect);
@ -123,6 +118,9 @@ void _mi_prim_thread_done_auto_done(void);
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
//-------------------------------------------------------------------
// Access to TLS (thread local storage) slots.
// We need fast access to both a unique thread id (in `free.c:mi_free`) and
@ -210,19 +208,19 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
#elif _WIN32 && MI_WIN_USE_FIXED_TLS && !defined(MI_WIN_USE_FLS)
// On windows we can store the thread-local heap at a fixed TLS slot to avoid
// thread-local initialization checks in the fast path.
// We allocate a user TLS slot at process initialization (see `windows/prim.c`)
// and store the offset `_mi_win_tls_offset`.
#define MI_HAS_TLS_SLOT 1 // 2 = we can reliably initialize the slot (saving a test on each malloc)
// thread-local initialization checks in the fast path. This uses a fixed location
// in the TCB though (last user-reserved slot by default) which may clash with other applications.
extern mi_decl_hidden size_t _mi_win_tls_offset;
#define MI_HAS_TLS_SLOT 2 // 2 = we can reliably initialize the slot (saving a test on each malloc)
#if MI_WIN_USE_FIXED_TLS > 1
#define MI_TLS_SLOT (MI_WIN_USE_FIXED_TLS)
#elif MI_SIZE_SIZE == 4
#define MI_TLS_SLOT (0x0E10 + _mi_win_tls_offset) // User TLS slots <https://en.wikipedia.org/wiki/Win32_Thread_Information_Block>
#define MI_TLS_SLOT (0x710) // Last user-reserved slot <https://en.wikipedia.org/wiki/Win32_Thread_Information_Block>
// #define MI_TLS_SLOT (0xF0C) // Last TlsSlot (might clash with other app reserved slot)
#else
#define MI_TLS_SLOT (0x1480 + _mi_win_tls_offset) // User TLS slots <https://en.wikipedia.org/wiki/Win32_Thread_Information_Block>
#define MI_TLS_SLOT (0x888) // Last user-reserved slot <https://en.wikipedia.org/wiki/Win32_Thread_Information_Block>
// #define MI_TLS_SLOT (0x1678) // Last TlsSlot (might clash with other app reserved slot)
#endif
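
For reference, MSVC can read the TEB at a fixed byte offset with intrinsics; a minimal hypothetical sketch of such a read (not the actual `mi_prim_tls_slot` definition, which follows in the header):

#include <intrin.h>    // __readgsqword / __readfsdword (MSVC)
#include <stddef.h>

// Read a pointer-sized TEB value at a byte offset: gs: on x64, fs: on x86.
static void* teb_read_slot(size_t offset) {
#if defined(_M_X64)
  return (void*)__readgsqword((unsigned long)offset);
#elif defined(_M_IX86)
  return (void*)__readfsdword((unsigned long)offset);
#else
  (void)offset; return NULL;   // other targets not sketched here
#endif
}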
static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
@ -271,8 +269,8 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
// defined in `init.c`; do not use these directly
extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called?
extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
extern bool _mi_process_is_initialized; // has mi_process_init been called?
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
@ -400,7 +398,7 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) {
#elif defined(MI_TLS_PTHREAD)
extern mi_decl_hidden pthread_key_t _mi_heap_default_key;
extern pthread_key_t _mi_heap_default_key;
static inline mi_heap_t* mi_prim_get_default_heap(void) {
mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);

@ -13,9 +13,8 @@ terms of the MIT license. A copy of the license can be found in the file
// mi_heap_t : all data for a thread-local heap, contains
// lists of all managed heap pages.
// mi_segment_t : a larger chunk of memory (32GiB) from where pages
// are allocated. A segment is divided in slices (64KiB) from
// which pages are allocated.
// mi_page_t : a "mimalloc" page (usually 64KiB or 512KiB) from
// are allocated.
// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from
// where objects are allocated.
// Note: we write "OS page" for OS memory pages while
// using plain "page" for mimalloc pages (`mi_page_t`).
@ -67,10 +66,10 @@ terms of the MIT license. A copy of the license can be found in the file
// #define MI_DEBUG 2 // + internal assertion checks
// #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
#if !defined(MI_DEBUG)
#if defined(MI_BUILD_RELEASE) || defined(NDEBUG)
#define MI_DEBUG 0
#else
#if !defined(NDEBUG) || defined(_DEBUG)
#define MI_DEBUG 2
#else
#define MI_DEBUG 0
#endif
#endif
@ -168,40 +167,38 @@ typedef int32_t mi_ssize_t;
// ------------------------------------------------------
// Main tuning parameters for segment and page sizes
// Sizes for 64-bit (usually divide by two for 32-bit)
#ifndef MI_SEGMENT_SLICE_SHIFT
#define MI_SEGMENT_SLICE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB (32KiB on 32-bit)
#endif
#ifndef MI_SEGMENT_SHIFT
#if MI_INTPTR_SIZE > 4
#define MI_SEGMENT_SHIFT ( 9 + MI_SEGMENT_SLICE_SHIFT) // 32MiB
#else
#define MI_SEGMENT_SHIFT ( 7 + MI_SEGMENT_SLICE_SHIFT) // 4MiB on 32-bit
#endif
#endif
// Sizes for 64-bit, divide by two for 32-bit
#ifndef MI_SMALL_PAGE_SHIFT
#define MI_SMALL_PAGE_SHIFT (MI_SEGMENT_SLICE_SHIFT) // 64KiB
#define MI_SMALL_PAGE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB
#endif
#ifndef MI_MEDIUM_PAGE_SHIFT
#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB
#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB
#endif
#ifndef MI_LARGE_PAGE_SHIFT
#define MI_LARGE_PAGE_SHIFT ( 3 + MI_MEDIUM_PAGE_SHIFT) // 4MiB
#endif
#ifndef MI_SEGMENT_SHIFT
#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB -- must be equal to `MI_LARGE_PAGE_SHIFT`
#endif
// Derived constants
#define MI_SEGMENT_SIZE (MI_ZU(1)<<MI_SEGMENT_SHIFT)
#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
#define MI_SEGMENT_ALIGN (MI_SEGMENT_SIZE)
#define MI_SEGMENT_MASK ((uintptr_t)(MI_SEGMENT_ALIGN - 1))
#define MI_SEGMENT_SLICE_SIZE (MI_ZU(1)<< MI_SEGMENT_SLICE_SHIFT)
#define MI_SLICES_PER_SEGMENT (MI_SEGMENT_SIZE / MI_SEGMENT_SLICE_SIZE) // 1024
#define MI_SMALL_PAGE_SIZE (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
#define MI_MEDIUM_PAGE_SIZE (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
#define MI_LARGE_PAGE_SIZE (MI_ZU(1)<<MI_LARGE_PAGE_SHIFT)
#define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/8) // 8 KiB on 64-bit
#define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/8) // 64 KiB on 64-bit
#define MI_MEDIUM_OBJ_WSIZE_MAX (MI_MEDIUM_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
#define MI_LARGE_OBJ_SIZE_MAX (MI_SEGMENT_SIZE/2) // 16 MiB on 64-bit
#define MI_SMALL_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
#define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)
#define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE)
// The max object size are checked to not waste more than 12.5% internally over the page sizes.
// (Except for large pages since huge objects are allocated in 4MiB chunks)
#define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/8) // 8 KiB
#define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/8) // 64 KiB
#define MI_LARGE_OBJ_SIZE_MAX (MI_LARGE_PAGE_SIZE/4) // 1 MiB
#define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
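
As a quick arithmetic check of the 1.x values above on 64-bit (`MI_INTPTR_SHIFT == 3`), a standalone sketch that spells out the shifts:

#include <assert.h>
#include <stddef.h>

// 64-bit: small pages 64 KiB, medium 512 KiB, large == segment 4 MiB,
// and the large-object limit is a quarter of that, 1 MiB.
#define KiB ((size_t)1 << 10)
#define MiB ((size_t)1 << 20)
static_assert(((size_t)1 << (13 + 3))         ==  64 * KiB, "MI_SMALL_PAGE_SIZE");
static_assert(((size_t)1 << (13 + 3 + 3))     == 512 * KiB, "MI_MEDIUM_PAGE_SIZE");
static_assert(((size_t)1 << (13 + 3 + 3 + 3)) ==   4 * MiB, "MI_LARGE_PAGE_SIZE == MI_SEGMENT_SIZE");
static_assert((((size_t)1 << (13 + 3 + 3 + 3)) / 4) == 1 * MiB, "MI_LARGE_OBJ_SIZE_MAX");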
// Maximum number of size classes. (spaced exponentially in 12.5% increments)
@ -209,27 +206,18 @@ typedef int32_t mi_ssize_t;
#error "mimalloc internal: expecting 73 bins"
#endif
#if (MI_MEDIUM_OBJ_WSIZE_MAX >= 655360)
#if (MI_LARGE_OBJ_WSIZE_MAX >= 655360)
#error "mimalloc internal: define more bins"
#endif
// Maximum block size for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`)
#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX)
#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX)
// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments
#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1)
#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1)
// Maximum slice count (255) for which we can find the page for interior pointers
#define MI_MAX_SLICE_OFFSET_COUNT ((MI_BLOCK_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)
// we never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
// on 64-bit+ systems we also limit the maximum allocation size such that the slice count fits in 32-bits. (issue #877)
#if (PTRDIFF_MAX > INT32_MAX) && (PTRDIFF_MAX >= (MI_SEGMENT_SLIZE_SIZE * UINT32_MAX))
#define MI_MAX_ALLOC_SIZE (MI_SEGMENT_SLICE_SIZE * (UINT32_MAX-1))
#else
// We never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX
#endif
// ------------------------------------------------------
// Mimalloc pages contain allocated blocks
@ -308,8 +296,8 @@ typedef uintptr_t mi_thread_free_t;
// Notes:
// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
// - Using `uint16_t` does not seem to slow things down
// - The size is 12 words on 64-bit which helps the page index calculations
// (and 14 words on 32-bit, and encoded free lists add 2 words)
// - The size is 10 words on 64-bit which helps the page index calculations
// (and 12 words on 32-bit, and encoded free lists add 2 words)
// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize
// concurrent frees where only the first concurrent free adds to the owning
// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`).
@ -319,12 +307,12 @@ typedef uintptr_t mi_thread_free_t;
// will be freed correctly even if only other threads free blocks.
typedef struct mi_page_s {
// "owned" by the segment
uint32_t slice_count; // slices in this page (0 if not a page)
uint32_t slice_offset; // distance from the actual page data slice (0 if a page)
uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]`
uint8_t segment_in_use:1; // `true` if the segment allocated this page
uint8_t is_committed:1; // `true` if the page virtual memory is committed
uint8_t is_zero_init:1; // `true` if the page was initially zero initialized
uint8_t is_huge:1; // `true` if the page is in a huge segment (`segment->kind == MI_SEGMENT_HUGE`)
// padding
uint8_t is_huge:1; // `true` if the page is in a huge segment
// layout like this to optimize access in `mi_malloc` and `mi_free`
uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear`
uint16_t reserved; // number of blocks reserved in memory
@ -348,11 +336,12 @@ typedef struct mi_page_s {
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
_Atomic(uintptr_t) xheap;
struct mi_page_s* next; // next page owned by this thread with the same `block_size`
struct mi_page_s* prev; // previous page owned by this thread with the same `block_size`
struct mi_page_s* next; // next page owned by the heap with the same `block_size`
struct mi_page_s* prev; // previous page owned by the heap with the same `block_size`
// 64-bit 11 words, 32-bit 13 words, (+2 for secure)
#if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit
void* padding[1];
#endif
} mi_page_t;
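
The delayed-free encoding described in the comments above relies on blocks being word-aligned, so the low two bits of a block pointer are free to carry state; a small illustrative sketch (names invented, not the mimalloc API):

#include <stdint.h>

// Pack a block pointer and a 2-bit state into one word, as mi_thread_free_t does.
typedef uintptr_t tagged_block_t;

static tagged_block_t tag_make(void* block, unsigned state) {     // state < 4
  return (uintptr_t)block | (uintptr_t)(state & 0x3);
}
static void*    tag_block(tagged_block_t t) { return (void*)(t & ~(uintptr_t)0x3); }
static unsigned tag_state(tagged_block_t t) { return (unsigned)(t & 0x3); }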
@ -365,44 +354,10 @@ typedef enum mi_page_kind_e {
MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment
MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment
MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment
MI_PAGE_HUGE // a huge page is a single page in a segment of variable size
// used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`.
MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned)
// used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an alignment `> MI_BLOCK_ALIGNMENT_MAX`.
} mi_page_kind_t;
typedef enum mi_segment_kind_e {
MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside.
MI_SEGMENT_HUGE, // segment with just one huge page inside.
} mi_segment_kind_t;
// ------------------------------------------------------
// A segment holds a commit mask where a bit is set if
// the corresponding MI_COMMIT_SIZE area is committed.
// The MI_COMMIT_SIZE must be a multiple of the slice
// size. If it is equal we have the most fine grained
// decommit (but setting it higher can be more efficient).
// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will
// be committed in one go which can be set higher than
// MI_COMMIT_SIZE for efficiency (while the decommit mask
// is still tracked in fine-grained MI_COMMIT_SIZE chunks)
// ------------------------------------------------------
#define MI_MINIMAL_COMMIT_SIZE (1*MI_SEGMENT_SLICE_SIZE)
#define MI_COMMIT_SIZE (MI_SEGMENT_SLICE_SIZE) // 64KiB
#define MI_COMMIT_MASK_BITS (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)
#define MI_COMMIT_MASK_FIELD_BITS MI_SIZE_BITS
#define MI_COMMIT_MASK_FIELD_COUNT (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS)
#if (MI_COMMIT_MASK_BITS != (MI_COMMIT_MASK_FIELD_COUNT * MI_COMMIT_MASK_FIELD_BITS))
#error "the segment size must be exactly divisible by the (commit size * size_t bits)"
#endif
typedef struct mi_commit_mask_s {
size_t mask[MI_COMMIT_MASK_FIELD_COUNT];
} mi_commit_mask_t;
typedef mi_page_t mi_slice_t;
typedef int64_t mi_msecs_t;
// ---------------------------------------------------------------
// a memory id tracks the provenance of arena/OS allocated memory
@ -446,57 +401,43 @@ typedef struct mi_memid_s {
} mi_memid_t;
// -----------------------------------------------------------------------------------------
// Segments are large allocated memory blocks (32mb on 64 bit) from arenas or the OS.
//
// Inside segments we allocated fixed size mimalloc pages (`mi_page_t`) that contain blocks.
// The start of a segment is this structure with a fixed number of slice entries (`slices`)
// usually followed by a guard OS page and the actual allocation area with pages.
// While a page is not allocated, we view it's data as a `mi_slice_t` (instead of a `mi_page_t`).
// Of any free area, the first slice has the info and `slice_offset == 0`; for any subsequent
// slices part of the area, the `slice_offset` is the byte offset back to the first slice
// (so we can quickly find the page info on a free, `internal.h:_mi_segment_page_of`).
// For slices, the `block_size` field is repurposed to signify if a slice is used (`1`) or not (`0`).
// Small and medium pages use a fixed amount of slices to reduce slice fragmentation, while
// large and huge pages span a variable amount of slices.
// ---------------------------------------------------------------
// Segments contain mimalloc pages
// ---------------------------------------------------------------
typedef struct mi_subproc_s mi_subproc_t;
// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS.
// Inside segments we allocated fixed size _pages_ that contain blocks.
typedef struct mi_segment_s {
// constant fields
mi_memid_t memid; // memory id for arena/OS allocation
bool allow_decommit; // can we decommit the memory
bool allow_purge; // can we purge the memory (reset or decommit)
size_t segment_size;
mi_subproc_t* subproc; // segment belongs to sub process
mi_memid_t memid; // memory id to track provenance
bool allow_decommit;
bool allow_purge;
size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE`
mi_subproc_t* subproc; // segment belongs to sub process
// segment fields
mi_msecs_t purge_expire; // purge slices in the `purge_mask` after this time
mi_commit_mask_t purge_mask; // slices that can be purged
mi_commit_mask_t commit_mask; // slices that are currently committed
struct mi_segment_s* next; // must be the first (non-constant) segment field -- see `segment.c:segment_init`
struct mi_segment_s* prev;
bool was_reclaimed; // true if it was reclaimed (used to limit reclaim-on-free reclamation)
bool dont_free; // can be temporarily true to ensure the segment is not freed
// from here is zero initialized
struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`)
bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation)
bool dont_free; // can be temporarily true to ensure the segment is not freed
size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it is too long)
size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
size_t abandoned_visits; // count how often this segment is visited during abandoned reclamation (to force reclaim if it takes too long)
size_t used; // count of pages in use
uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`
size_t used; // count of pages in use (`used <= capacity`)
size_t capacity; // count of available pages (`#free + used`)
size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages.
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled
struct mi_segment_s* abandoned_os_prev;
size_t segment_slices; // for huge segments this may be different from `MI_SLICES_PER_SEGMENT`
size_t segment_info_slices; // initial count of slices that we are using for segment info and possible guard pages.
// layout like this to optimize access in `mi_free`
mi_segment_kind_t kind;
size_t slice_entries; // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT`
_Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment
mi_slice_t slices[MI_SLICES_PER_SEGMENT+1]; // one extra final entry for huge blocks with large alignment
size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge
mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
} mi_segment_t;
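
The comments above mention that the page info for a block can be found quickly on `free` (`internal.h:_mi_segment_page_of`). As a rough stand-alone sketch of that idea (illustrative constants and `ex_*` types, not mimalloc's own definitions): because segments are allocated aligned to their size, masking an interior pointer recovers its segment, and a fixed page size inside the segment then gives the page index.

```c
#include <stdint.h>
#include <stddef.h>

#define EX_SEGMENT_SIZE   (4u * 1024u * 1024u)   // hypothetical segment size
#define EX_PAGES_PER_SEG  64u                    // hypothetical page count per segment

typedef struct ex_segment_s { int meta; } ex_segment_t;  // stand-in for the segment header

// segments are aligned to EX_SEGMENT_SIZE, so masking the low bits of any
// interior pointer yields the start of its segment
static inline ex_segment_t* ex_segment_of(const void* p) {
  return (ex_segment_t*)((uintptr_t)p & ~((uintptr_t)EX_SEGMENT_SIZE - 1));
}

// with fixed-size pages inside a segment, the page index is the offset
// from the segment start divided by the page size
static inline size_t ex_page_index_of(const ex_segment_t* segment, const void* p) {
  const size_t page_size = EX_SEGMENT_SIZE / EX_PAGES_PER_SEG;
  return ((uintptr_t)p - (uintptr_t)segment) / page_size;
}
```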
@ -571,6 +512,7 @@ struct mi_heap_s {
size_t guarded_size_min; // minimal size for guarded objects
size_t guarded_size_max; // maximal size for guarded objects
size_t guarded_sample_rate; // sample rate (set to 0 to disable guarded pages)
size_t guarded_sample_seed; // starting sample count
size_t guarded_sample_count; // current sample count (counting down to 0)
#endif
mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
@ -599,19 +541,20 @@ struct mi_subproc_s {
// Thread Local data
// ------------------------------------------------------
// A "span" is is an available range of slices. The span queues keep
// track of slice spans of at most the given `slice_count` (but more than the previous size class).
typedef struct mi_span_queue_s {
mi_slice_t* first;
mi_slice_t* last;
size_t slice_count;
} mi_span_queue_t;
// Milliseconds as in `int64_t` to avoid overflows
typedef int64_t mi_msecs_t;
#define MI_SEGMENT_BIN_MAX (35) // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT)
// Queue of segments
typedef struct mi_segment_queue_s {
mi_segment_t* first;
mi_segment_t* last;
} mi_segment_queue_t;
// Segments thread local data
typedef struct mi_segments_tld_s {
mi_span_queue_t spans[MI_SEGMENT_BIN_MAX+1]; // free slice spans inside segments
mi_segment_queue_t small_free; // queue of segments with free small pages
mi_segment_queue_t medium_free; // queue of segments with free medium pages
mi_page_queue_t pages_purge; // queue of freed pages that are delay-purged
size_t count; // current number of segments
size_t peak_count; // peak number of segments
size_t current_size; // current size of all segments
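
Both queue types in this hunk are intrusive first/last lists threaded through the elements' own `next`/`prev` links. A minimal stand-alone sketch of that pattern (illustrative `ex_*` names, not mimalloc's code):

```c
#include <stddef.h>

typedef struct ex_seg_s {
  struct ex_seg_s* next;
  struct ex_seg_s* prev;
} ex_seg_t;

typedef struct ex_seg_queue_s {
  ex_seg_t* first;
  ex_seg_t* last;
} ex_seg_queue_t;

// append at the tail of the queue
static void ex_queue_push_back(ex_seg_queue_t* q, ex_seg_t* s) {
  s->next = NULL;
  s->prev = q->last;
  if (q->last != NULL) { q->last->next = s; } else { q->first = s; }
  q->last = s;
}

// unlink an element from anywhere in the queue
static void ex_queue_remove(ex_seg_queue_t* q, ex_seg_t* s) {
  if (s->prev != NULL) { s->prev->next = s->next; } else { q->first = s->next; }
  if (s->next != NULL) { s->next->prev = s->prev; } else { q->last = s->prev; }
  s->next = NULL; s->prev = NULL;
}
```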
@ -632,6 +575,7 @@ struct mi_tld_s {
};
// ------------------------------------------------------
// Debug
// ------------------------------------------------------
@ -646,6 +590,26 @@ struct mi_tld_s {
#define MI_DEBUG_PADDING (0xDE)
#endif
#if (MI_DEBUG)
// use our own assertion to print without memory allocation
void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
#else
#define mi_assert(x)
#endif
#if (MI_DEBUG>1)
#define mi_assert_internal mi_assert
#else
#define mi_assert_internal(x)
#endif
#if (MI_DEBUG>2)
#define mi_assert_expensive mi_assert
#else
#define mi_assert_expensive(x)
#endif
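
As a quick illustration of the three levels (hypothetical function; it assumes the macros above are in scope): plain `mi_assert` fires whenever `MI_DEBUG` is set, `mi_assert_internal` only for `MI_DEBUG>1`, and `mi_assert_expensive` is reserved for costly checks enabled at `MI_DEBUG>2`.

```c
#include <stddef.h>

static void ex_check_block(const void* p, size_t size) {
  mi_assert(p != NULL);                  // cheap invariant, on in any debug build
  mi_assert_internal(size > 0);          // internal consistency, MI_DEBUG > 1 only
  mi_assert_expensive(size < ((size_t)1 << 40));  // stand-in for an O(n)-style check, MI_DEBUG > 2 only
}
```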
// ------------------------------------------------------
// Statistics
@ -661,25 +625,22 @@ struct mi_tld_s {
// add to stat keeping track of the peak
void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount);
// counters can just be increased
void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
#if (MI_STAT)
#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount)
#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount)
#define mi_stat_adjust_decrease(stat,amount) _mi_stat_adjust_decrease( &(stat), amount)
#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount)
#else
#define mi_stat_increase(stat,amount) ((void)0)
#define mi_stat_decrease(stat,amount) ((void)0)
#define mi_stat_adjust_decrease(stat,amount) ((void)0)
#define mi_stat_counter_increase(stat,amount) ((void)0)
#endif
#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_adjust_decrease(heap,stat,amount) mi_stat_adjust_decrease( (heap)->tld->stats.stat, amount)
#endif
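
For reference, the allocation path later in this diff (in `alloc.c`) uses these macros roughly as in the sketch below; `mi_heap_t` and the `malloc_normal`/`malloc_normal_count` stat names are taken from the surrounding code, while the helper itself is hypothetical.

```c
// hypothetical helper: account a normal-sized allocation on its owning heap
static void ex_account_alloc(mi_heap_t* heap, size_t block_size) {
  mi_heap_stat_increase(heap, malloc_normal, block_size);       // tracks current and peak
  mi_heap_stat_counter_increase(heap, malloc_normal_count, 1);  // simple monotonic counter
}
```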

readme.md
View file

@ -12,9 +12,9 @@ is a general purpose allocator with excellent [performance](#performance) charac
Initially developed by Daan Leijen for the runtime systems of the
[Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
Latest release : `v3.1.5` (beta) (2025-06-13).
Latest v2 release: `v2.2.4` (2025-06-09).
Latest v1 release: `v1.9.4` (2024-06-09).
Latest release : `v3.0.2` (beta) (2025-03-06).
Latest v2 release: `v2.2.2` (2025-03-06).
Latest v1 release: `v1.9.2` (2024-03-06).
mimalloc is a drop-in replacement for `malloc` and can be used in other programs
without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
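
`LD_PRELOAD=/usr/lib/libmimalloc.so myprogram`

(The exact path of the shared library depends on the distribution and install prefix; this invocation is only illustrative.)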
@ -72,25 +72,18 @@ Enjoy!
### Branches
* `main`: latest stable release (still based on `dev2`).
* `master`: latest stable release (still based on `dev2`).
* `dev`: development branch for mimalloc v1. Use this branch for submitting PR's.
* `dev2`: development branch for mimalloc v2. This branch is downstream of `dev`
(and is essentially equal to `dev` except for `src/segment.c`). Uses larger sliced segments to manage
mimalloc pages that can reduce fragmentation.
* `dev3`: development branch for mimalloc v3 beta. This branch is downstream of `dev`. This version
simplifies the lock-free ownership of previous versions, and improves sharing of memory between
threads. On certain large workloads this version may use (much) less memory.
* `dev3`: development branch for mimalloc v3-beta. This branch is downstream of `dev`. This version
simplifies the lock-free ownership of previous versions and no longer uses thread-local segments.
This improves sharing of memory between threads, and on certain large workloads may use less memory
with less fragmentation.
### Releases
* 2025-06-13, `v3.1.5`: Bug fix release where memory was not always correctly committed (issue #1098).
* 2025-06-09, `v1.9.4`, `v2.2.4`, `v3.1.4` (beta) : Some important bug fixes, including a case where OS memory
was not always fully released. Improved v3 performance, build on XBox, fix build on Android, support interpose
for older macOS versions, use MADV_FREE_REUSABLE on macOS, always check commit success, better support for Windows
fixed TLS offset, etc.
* 2025-03-28, `v1.9.3`, `v2.2.3`, `v3.0.3` (beta) : Various small bug and build fixes, including:
fix arm32 pre v7 builds, fix mingw build, get runtime statistics, improve statistic commit counts,
fix execution on non BMI1 x64 systems.
* 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes.
Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`.
Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile.
@ -104,13 +97,53 @@ Enjoy!
add 0-byte to canary; upstream CPython fixes; reduce .bss size; allow fixed TLS slot on Windows for improved performance.
* 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches
from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches.
* 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation.
* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds.
Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size
directly available (and new `block_size_shift` to improve aligned block free-ing).
New approach to collection of abandoned segments: When
a thread terminates the segments it owns are abandoned (containing still live objects) and these can be
reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arenas
which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in
an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim`
gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%).
* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity
by removing regions and segment caches and only using arenas with improved memory purging -- this may improve memory
usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking.
* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms.
* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support for dynamic overriding on Windows 11. Improved tracing precision
with [asan](#asan) and [Valgrind](#valgrind), and added Windows event tracing [ETW](#ETW) (contributed by Xinglong He). Created an OS
abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes.
* 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support.
Support arbitrary large alignments (in particular for `std::pmr` pools).
Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev).
Heap walks now visit all objects (including huge objects). Support Windows nano server containers (by Johannes Schindelin, @dscho).
Various small bug fixes.
* 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow
detection. Initial
support for attaching heaps to a specific memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`.
* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation
even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix
warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object
allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes.
* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on
Windows 11, fix compilation with musl, potentially reduced
committed memory, add `bin/minject` for Windows,
improved wasm support, faster aligned allocation,
various small fixes.
* [Older release notes](#older-release-notes)
Special thanks to:
* Sergiy Kuryata for his contributions on reducing memory commit -- especially on Windows with the Windows thread pool (now implemented in v3).
* [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his _many_ contributions, and making
* [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his many contributions, and making
mimalloc work better on many less common operating systems, like Haiku, Dragonfly, etc.
* Mary Feofanova (@mary3000), Evgeniy Moiseenko, and Manuel Pöter (@mpoeter) for making mimalloc TSAN checkable, and finding
memory model bugs using the [genMC] model checker.
@ -141,7 +174,7 @@ mimalloc is used in various large scale low-latency services and programs, for e
Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build.
The `mimalloc-lib` project builds a static library (in `out/msvc-x64`), while the
`mimalloc-override-dll` project builds a DLL for overriding malloc
`mimalloc-override-dll` project builds DLL for overriding malloc
in the entire program.
## Linux, macOS, BSD, etc.
@ -865,48 +898,6 @@ provided by the bot. You will only need to do this once across all repos using o
# Older Release Notes
* 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation.
* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds.
Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size
directly available (and new `block_size_shift` to improve aligned block free-ing).
New approach to collection of abandoned segments: When
a thread terminates the segments it owns are abandoned (containing still live objects) and these can be
reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arenas
which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in
an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim`
gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%).
* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity
by removing regions and segment caches and only using arenas with improved memory purging -- this may improve memory
usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking.
* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms.
* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support for dynamic overriding on Windows 11. Improved tracing precision
with [asan](#asan) and [Valgrind](#valgrind), and added Windows event tracing [ETW](#ETW) (contributed by Xinglong He). Created an OS
abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes.
* 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support.
Support arbitrary large alignments (in particular for `std::pmr` pools).
Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev).
Heap walks now visit all objects (including huge objects). Support Windows nano server containers (by Johannes Schindelin, @dscho).
Various small bug fixes.
* 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow
detection. Initial
support for attaching heaps to a specific memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`.
* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation
even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix
warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object
allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes.
* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on
Windows 11, fix compilation with musl, potentially reduced
committed memory, add `bin/minject` for Windows,
improved wasm support, faster aligned allocation,
various small fixes.
* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including
M1), improved performance for v2 for large objects, Python integration improvements, more standard
installation directories, various small fixes.

View file

@ -115,7 +115,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
// now zero the block if needed
if (alignment > MI_BLOCK_ALIGNMENT_MAX) {
// for the tracker, on huge aligned allocations only the memory from the start of the large block is defined
// for the tracker, on huge aligned allocations only from the start of the large block is defined
mi_track_mem_undefined(aligned_p, size);
if (zero) {
_mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
@ -191,6 +191,9 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0;
if mi_likely(is_aligned)
{
#if MI_STAT>1
mi_heap_stat_increase(heap, malloc_requested, size);
#endif
void* p = (zero ? _mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen
mi_assert_internal(p != NULL);
mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
@ -217,11 +220,6 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap,
return mi_heap_malloc_aligned_at(heap, size, alignment, 0);
}
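
The fast path in the hunk above tests alignment with a bit mask. A stand-alone sketch of that test (illustrative helper, assuming `alignment` is a power of two so that `alignment - 1` is a valid mask):

```c
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

static inline bool ex_is_aligned_at(const void* p, size_t offset, size_t alignment) {
  const uintptr_t align_mask = (uintptr_t)alignment - 1;   // e.g. 32 -> 0x1F
  return (((uintptr_t)p + offset) & align_mask) == 0;      // low bits must all be zero
}
// e.g. ex_is_aligned_at((void*)0x1000, 16, 32) is false since 0x1010 is not a multiple of 32
```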
// ensure a definition is emitted
#if defined(__cplusplus)
void* _mi_extern_heap_malloc_aligned = (void*)&mi_heap_malloc_aligned;
#endif
// ------------------------------------------------------
// Aligned Allocation
// ------------------------------------------------------

View file

@ -71,20 +71,24 @@ typedef void* mi_nothrow_t;
#define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
#define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun)
#define MI_INTERPOSE_DECLS(name) __attribute__((used)) static struct mi_interpose_s name[] __attribute__((section("__DATA, __interpose")))
MI_INTERPOSE_DECLS(_mi_interposes) =
__attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) =
{
MI_INTERPOSE_MI(malloc),
MI_INTERPOSE_MI(calloc),
MI_INTERPOSE_MI(realloc),
MI_INTERPOSE_MI(strdup),
#if defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7
MI_INTERPOSE_MI(strndup),
#endif
MI_INTERPOSE_MI(realpath),
MI_INTERPOSE_MI(posix_memalign),
MI_INTERPOSE_MI(reallocf),
MI_INTERPOSE_MI(valloc),
MI_INTERPOSE_FUN(malloc_size,mi_malloc_size_checked),
MI_INTERPOSE_MI(malloc_good_size),
#if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
MI_INTERPOSE_MI(aligned_alloc),
#endif
#ifdef MI_OSX_ZONE
// we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely
MI_INTERPOSE_MI(free),
@ -95,12 +99,6 @@ typedef void* mi_nothrow_t;
MI_INTERPOSE_FUN(vfree,mi_cfree),
#endif
};
MI_INTERPOSE_DECLS(_mi_interposes_10_7) __OSX_AVAILABLE(10.7) = {
MI_INTERPOSE_MI(strndup),
};
MI_INTERPOSE_DECLS(_mi_interposes_10_15) __OSX_AVAILABLE(10.15) = {
MI_INTERPOSE_MI(aligned_alloc),
};
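
For context, the `__interpose` section used above is dyld's interposition mechanism: an array of (replacement, original) pairs that dyld consults when binding symbols. A minimal stand-alone sketch of the same pattern (hypothetical counting wrapper, not this file's code):

```c
#include <stdlib.h>
#include <stddef.h>

static size_t ex_malloc_calls = 0;

static void* ex_counting_malloc(size_t n) {
  ex_malloc_calls++;   // not thread-safe; illustration only
  // dyld does not apply interposing to the image that defines the tuples,
  // so this call reaches the original system malloc
  return malloc(n);
}

struct ex_interpose_s { const void* replacement; const void* original; };

__attribute__((used)) static struct ex_interpose_s ex_interposers[]
    __attribute__((section("__DATA,__interpose"))) = {
  { (const void*)&ex_counting_malloc, (const void*)&malloc },
};
```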
#ifdef __cplusplus
extern "C" {

View file

@ -30,7 +30,6 @@ terms of the MIT license. A copy of the license can be found in the file
// Note: in release mode the (inlined) routine is about 7 instructions with a single test.
extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept
{
mi_assert_internal(size >= MI_PADDING_SIZE);
mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size);
// check the free list
@ -83,13 +82,12 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_
#if (MI_STAT>0)
const size_t bsize = mi_page_usable_block_size(page);
if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_increase(heap, malloc_normal, bsize);
mi_heap_stat_counter_increase(heap, malloc_normal_count, 1);
#if (MI_STAT>1)
const size_t bin = _mi_bin(bsize);
mi_heap_stat_increase(heap, malloc_bins[bin], 1);
mi_heap_stat_increase(heap, malloc_requested, size - MI_PADDING_SIZE);
#endif
}
#endif
@ -148,6 +146,12 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap,
void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero);
mi_track_malloc(p,size,zero);
#if MI_STAT>1
if (p != NULL) {
if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p));
}
#endif
#if MI_DEBUG>3
if (p != NULL && zero) {
mi_assert_expensive(mi_mem_is_zero(p, size));
@ -184,6 +188,12 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z
void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic
mi_track_malloc(p,size,zero);
#if MI_STAT>1
if (p != NULL) {
if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p));
}
#endif
#if MI_DEBUG>3
if (p != NULL && zero) {
mi_assert_expensive(mi_mem_is_zero(p, size));
@ -630,7 +640,7 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
// give up to place it right in front of the guard page if the offset is too large for unalignment
offset = MI_BLOCK_ALIGNMENT_MAX;
}
void* p = (uint8_t*)block + offset;
void* p = (uint8_t*)block + offset;
mi_track_align(block, p, offset, obj_size);
mi_track_mem_defined(block, sizeof(mi_block_t));
return p;
@ -652,12 +662,11 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo
void* const p = mi_block_ptr_set_guarded(block, obj_size);
// stats
mi_track_malloc(p, size, zero);
mi_track_malloc(p, size, zero);
if (p != NULL) {
if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
#if MI_STAT>1
mi_heap_stat_adjust_decrease(heap, malloc_requested, req_size);
mi_heap_stat_increase(heap, malloc_requested, size);
mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p));
#endif
_mi_stat_counter_increase(&heap->tld->stats.malloc_guarded_count, 1);
}
@ -685,7 +694,7 @@ void* _mi_externs[] = {
(void*)&mi_zalloc_small,
(void*)&mi_heap_malloc,
(void*)&mi_heap_zalloc,
(void*)&mi_heap_malloc_small,
(void*)&mi_heap_malloc_small
// (void*)&mi_heap_alloc_new,
// (void*)&mi_heap_alloc_new_n
};

View file

@ -44,7 +44,7 @@ typedef struct mi_arena_s {
mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited
_Atomic(size_t) search_idx; // optimization to start the search for free blocks
_Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be purged from `blocks_purge`.
mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted)
mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)
@ -99,10 +99,6 @@ bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_i
}
}
bool _mi_arena_memid_is_os_allocated(mi_memid_t memid) {
return (memid.memkind == MI_MEM_OS);
}
size_t mi_arena_get_count(void) {
return mi_atomic_load_relaxed(&mi_arena_count);
}
@ -192,9 +188,14 @@ void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) {
if (p != NULL) return p;
// or fall back to the OS
p = _mi_os_zalloc(size, memid);
p = _mi_os_alloc(size, memid);
if (p == NULL) return NULL;
// zero the OS memory if needed
if (!memid->initially_zero) {
_mi_memzero_aligned(p, size);
memid->initially_zero = true;
}
return p;
}
@ -254,7 +255,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
// set the dirty bits (todo: no need for an atomic op here?)
if (arena->memid.initially_zero && arena->blocks_dirty != NULL) {
memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL, NULL);
memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
}
// set commit state
@ -265,36 +266,21 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
else if (commit) {
// commit requested, but the range may not be committed as a whole: ensure it is committed now
memid->initially_committed = true;
const size_t commit_size = mi_arena_block_size(needed_bcount);
bool any_uncommitted;
size_t already_committed = 0;
_mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted, &already_committed);
_mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
if (any_uncommitted) {
mi_assert_internal(already_committed < needed_bcount);
const size_t stat_commit_size = commit_size - mi_arena_block_size(already_committed);
bool commit_zero = false;
if (!_mi_os_commit_ex(p, commit_size, &commit_zero, stat_commit_size)) {
if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero)) {
memid->initially_committed = false;
}
else {
if (commit_zero) { memid->initially_zero = true; }
}
}
else {
// all are already committed: signal that we are reusing memory in case it was purged before
_mi_os_reuse( p, commit_size );
}
}
else {
// no need to commit, but check if already fully committed
size_t already_committed = 0;
memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &already_committed);
if (!memid->initially_committed && already_committed > 0) {
// partially committed: as it will be committed at some time, adjust the stats and pretend the range is fully uncommitted.
mi_assert_internal(already_committed < needed_bcount);
_mi_stat_decrease(&_mi_stats_main.committed, mi_arena_block_size(already_committed));
_mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
}
memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
}
return p;
@ -368,7 +354,7 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz
static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t *arena_id)
{
if (_mi_preloading()) return false; // use OS only while pre loading
const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
if (arena_count > (MI_MAX_ARENAS - 4)) return false;
@ -410,7 +396,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset
// try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
if (!mi_option_is_enabled(mi_option_disallow_arena_alloc)) { // is arena allocation allowed?
if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0)
if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0)
{
void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid);
if (p != NULL) return p;
@ -478,19 +464,17 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks)
const size_t size = mi_arena_block_size(blocks);
void* const p = mi_arena_block_start(arena, bitmap_idx);
bool needs_recommit;
size_t already_committed = 0;
if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx, &already_committed)) {
if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) {
// all blocks are committed, we can purge freely
mi_assert_internal(already_committed == blocks);
needs_recommit = _mi_os_purge(p, size);
}
else {
// some blocks are not committed -- this can happen when a partially committed block is freed
// in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
// we need to ensure we do not try to reset (as that may be invalid for uncommitted memory).
mi_assert_internal(already_committed < blocks);
// we need to ensure we do not try to reset (as that may be invalid for uncommitted memory),
// and also undo the decommit stats (as it was already adjusted)
mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, mi_arena_block_size(already_committed));
needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, 0);
}
// clear the purged blocks
@ -524,7 +508,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t
else {
// already an expiration was set
}
_mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL, NULL);
_mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL);
}
}
@ -559,7 +543,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force)
{
// check pre-conditions
if (arena->memid.is_pinned) return false;
// expired yet?
mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
if (!force && (expire == 0 || expire > now)) return false;
@ -614,7 +598,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force)
return any_purged;
}
static void mi_arenas_try_purge( bool force, bool visit_all )
static void mi_arenas_try_purge( bool force, bool visit_all )
{
if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled
@ -631,7 +615,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all )
mi_atomic_guard(&purge_guard)
{
// increase global expire: at most one purge per delay cycle
mi_atomic_storei64_release(&mi_arenas_purge_expire, now + mi_arena_purge_delay());
mi_atomic_storei64_release(&mi_arenas_purge_expire, now + mi_arena_purge_delay());
size_t max_purge_count = (visit_all ? max_arena : 2);
bool all_visited = true;
for (size_t i = 0; i < max_arena; i++) {
@ -664,16 +648,15 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
if (p==NULL) return;
if (size==0) return;
const bool all_committed = (committed_size == size);
const size_t decommitted_size = (committed_size <= size ? size - committed_size : 0);
// need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.)
mi_track_mem_undefined(p,size);
if (mi_memkind_is_os(memid.memkind)) {
// was a direct OS allocation, pass through
if (!all_committed && decommitted_size > 0) {
// if partially committed, adjust the committed stats (as `_mi_os_free` will decrease commit by the full size)
_mi_stat_increase(&_mi_stats_main.committed, decommitted_size);
if (!all_committed && committed_size > 0) {
// if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size)
_mi_stat_decrease(&_mi_stats_main.committed, committed_size);
}
_mi_os_free(p, size, memid);
}
@ -707,14 +690,14 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
mi_assert_internal(arena->blocks_purge != NULL);
if (!all_committed) {
// mark the entire range as no longer committed (so we will recommit the full range when re-using)
// mark the entire range as no longer committed (so we recommit the full range when re-using)
_mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
mi_track_mem_noaccess(p,size);
//if (committed_size > 0) {
if (committed_size > 0) {
// if partially committed, adjust the committed stats (as it will be recommitted when re-using)
// in the delayed purge, we do no longer decrease the commit if the range is not marked entirely as committed.
// in the delayed purge, we now need to not count a decommit if the range is not marked as committed.
_mi_stat_decrease(&_mi_stats_main.committed, committed_size);
//}
}
// note: if not all committed, it may be that the purge will reset/decommit the entire range
// that contains already decommitted parts. Since purge consistently uses reset or decommit that
// works (as we should never reset decommitted parts).
@ -950,7 +933,7 @@ void mi_debug_show_arenas(void) mi_attr_noexcept {
for (size_t i = 0; i < max_arenas; i++) {
mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
if (arena == NULL) break;
_mi_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, (size_t)(MI_ARENA_BLOCK_SIZE / MI_MiB), arena->field_count, (arena->memid.is_pinned ? ", pinned" : ""));
_mi_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : ""));
if (show_inuse) {
inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count);
}
@ -1010,17 +993,17 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t
if (pages == 0) return 0;
// pages per numa node
int numa_count = (numa_nodes > 0 && numa_nodes <= INT_MAX ? (int)numa_nodes : _mi_os_numa_node_count());
if (numa_count == 0) numa_count = 1;
size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
if (numa_count <= 0) numa_count = 1;
const size_t pages_per = pages / numa_count;
const size_t pages_mod = pages % numa_count;
const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);
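// Worked example (comment added for illustration, not in the original source):
// with pages == 10 and numa_count == 4, pages_per == 2 and pages_mod == 2, so
// nodes 0 and 1 reserve 3 pages each while nodes 2 and 3 reserve 2 each (10 total).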
// reserve evenly among numa nodes
for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
size_t node_pages = pages_per; // can be 0
if ((size_t)numa_node < pages_mod) node_pages++;
int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per);
if (numa_node < pages_mod) node_pages++;
int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
if (err) return err;
if (pages < node_pages) {
pages = 0;

View file

@ -34,17 +34,17 @@ static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) {
}
/* -----------------------------------------------------------
Claim a bit sequence atomically
----------------------------------------------------------- */
// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
{
mi_assert_internal(bitmap_idx != NULL);
mi_assert_internal(count <= MI_BITMAP_FIELD_BITS);
mi_assert_internal(count > 0);
mi_bitmap_field_t* field = &bitmap[idx];
size_t map = mi_atomic_load_relaxed(field);
if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
@ -94,9 +94,9 @@ inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, cons
return false;
}
// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
// `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
size_t idx = start_field_idx;
for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
@ -108,24 +108,6 @@ bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fiel
return false;
}
// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fulfilled
bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields,
const size_t start_field_idx, const size_t count,
mi_bitmap_pred_fun_t pred_fun, void* pred_arg,
mi_bitmap_index_t* bitmap_idx) {
size_t idx = start_field_idx;
for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
if (idx >= bitmap_fields) idx = 0; // wrap
if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
if (pred_fun == NULL || pred_fun(*bitmap_idx, pred_arg)) {
return true;
}
// predicate returned false, unclaim and look further
_mi_bitmap_unclaim(bitmap, bitmap_fields, count, *bitmap_idx);
}
}
return false;
}
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
@ -246,7 +228,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit
// intermediate fields
while (++field < final_field) {
newmap = MI_BITMAP_FIELD_FULL;
newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0);
map = 0;
if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; }
}
@ -268,7 +250,7 @@ rollback:
// (we just failed to claim `field` so decrement first)
while (--field > initial_field) {
newmap = 0;
map = MI_BITMAP_FIELD_FULL;
map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0);
mi_assert_internal(mi_atomic_load_relaxed(field) == map);
mi_atomic_store_release(field, newmap);
}
@ -369,7 +351,7 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero, size_t* already_set) {
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
size_t idx = mi_bitmap_index_field(bitmap_idx);
size_t pre_mask;
size_t mid_mask;
@ -377,31 +359,28 @@ bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t co
size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
bool all_zero = true;
bool any_zero = false;
size_t one_count = 0;
_Atomic(size_t)*field = &bitmap[idx];
size_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
if ((prev & pre_mask) != 0) { all_zero = false; one_count += mi_popcount(prev & pre_mask); }
if ((prev & pre_mask) != 0) all_zero = false;
if ((prev & pre_mask) != pre_mask) any_zero = true;
while (mid_count-- > 0) {
prev = mi_atomic_or_acq_rel(field++, mid_mask);
if ((prev & mid_mask) != 0) { all_zero = false; one_count += mi_popcount(prev & mid_mask); }
if ((prev & mid_mask) != 0) all_zero = false;
if ((prev & mid_mask) != mid_mask) any_zero = true;
}
if (post_mask!=0) {
prev = mi_atomic_or_acq_rel(field, post_mask);
if ((prev & post_mask) != 0) { all_zero = false; one_count += mi_popcount(prev & post_mask); }
if ((prev & post_mask) != 0) all_zero = false;
if ((prev & post_mask) != post_mask) any_zero = true;
}
if (pany_zero != NULL) { *pany_zero = any_zero; }
if (already_set != NULL) { *already_set = one_count; };
mi_assert_internal(all_zero ? one_count == 0 : one_count <= count);
return all_zero;
}
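
The `pre_mask`/`mid_mask`/`post_mask` split above describes a run of `count` bits that may straddle several bitmap fields. As a small worked sketch of how such a run maps to a mask within one 64-bit field (stand-alone helper, not the library's `mi_bitmap_mask_`):

```c
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static uint64_t ex_bitmap_mask(size_t count, size_t bitidx) {
  if (count == 64) return ~UINT64_C(0);            // a full field
  return ((UINT64_C(1) << count) - 1) << bitidx;   // `count` one-bits starting at `bitidx`
}

int main(void) {
  // claiming 4 bits starting at bit 6 corresponds to mask 0x3c0 (binary 11_1100_0000)
  printf("%#llx\n", (unsigned long long)ex_bitmap_mask(4, 6));
  return 0;
}
```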
// Returns `true` if all `count` bits were 1.
// `any_ones` is `true` if there was at least one bit set to one.
static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones, size_t* already_set) {
static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
size_t idx = mi_bitmap_index_field(bitmap_idx);
size_t pre_mask;
size_t mid_mask;
@ -409,33 +388,30 @@ static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_field
size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
bool all_ones = true;
bool any_ones = false;
size_t one_count = 0;
mi_bitmap_field_t* field = &bitmap[idx];
size_t prev = mi_atomic_load_relaxed(field++);
if ((prev & pre_mask) != pre_mask) all_ones = false;
if ((prev & pre_mask) != 0) { any_ones = true; one_count += mi_popcount(prev & pre_mask); }
if ((prev & pre_mask) != 0) any_ones = true;
while (mid_count-- > 0) {
prev = mi_atomic_load_relaxed(field++);
if ((prev & mid_mask) != mid_mask) all_ones = false;
if ((prev & mid_mask) != 0) { any_ones = true; one_count += mi_popcount(prev & mid_mask); }
if ((prev & mid_mask) != 0) any_ones = true;
}
if (post_mask!=0) {
prev = mi_atomic_load_relaxed(field);
if ((prev & post_mask) != post_mask) all_ones = false;
if ((prev & post_mask) != 0) { any_ones = true; one_count += mi_popcount(prev & post_mask); }
if ((prev & post_mask) != 0) any_ones = true;
}
if (pany_ones != NULL) { *pany_ones = any_ones; }
if (already_set != NULL) { *already_set = one_count; }
mi_assert_internal(all_ones ? one_count == count : one_count < count);
return all_ones;
}
bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, size_t* already_set) {
return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL, already_set);
bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL);
}
bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
bool any_ones;
mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones, NULL);
mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
return any_ones;
}

View file

@ -44,11 +44,6 @@ static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx
return mi_bitmap_index_create_ex(idx,bitidx);
}
// Create a bit index.
static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) {
return mi_bitmap_index_create(full_bitidx / MI_BITMAP_FIELD_BITS, full_bitidx % MI_BITMAP_FIELD_BITS);
}
// Get the field index from a bit index.
static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx / MI_BITMAP_FIELD_BITS);
@ -76,10 +71,6 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fulfilled
typedef bool (mi_cdecl *mi_bitmap_pred_fun_t)(mi_bitmap_index_t bitmap_idx, void* pred_arg);
bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_pred_fun_t pred_fun, void* pred_arg, mi_bitmap_index_t* bitmap_idx);
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
@ -111,9 +102,9 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero, size_t* already_set);
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, size_t* already_set);
bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
#endif

View file

@ -35,9 +35,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
mi_check_padding(page, block);
if (track_stats) { mi_stat_free(page, block); }
#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN && !MI_GUARDED
if (!mi_page_is_huge(page)) { // huge page content may be already decommitted
memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
}
memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
#endif
if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block)); } // faster than mi_usable_size as we already know the page and that p is unaligned
@ -123,16 +121,10 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms
#if (MI_DEBUG>0)
if mi_unlikely(!mi_is_in_heap_region(p)) {
#if (MI_INTPTR_SIZE == 8 && defined(__linux__))
if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640)
#else
{
#endif
_mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
"(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) {
_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
}
_mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
"(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) {
_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
}
}
#endif
@ -280,7 +272,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg
// for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
_mi_padding_shrink(page, block, sizeof(mi_block_t));
if (segment->kind == MI_SEGMENT_HUGE) {
if (segment->page_kind == MI_PAGE_HUGE) {
#if MI_HUGE_PAGE_ABANDON
// huge page segments are always abandoned and can be freed immediately
_mi_segment_huge_page_free(segment, page, block);
@ -348,10 +340,7 @@ mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept {
void mi_free_size(void* p, size_t size) mi_attr_noexcept {
MI_UNUSED_RELEASE(size);
#if MI_DEBUG
const size_t available = _mi_usable_size(p,"mi_free_size");
mi_assert(p == NULL || size <= available || available == 0 /* invalid pointer */ );
#endif
mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size"));
mi_free(p);
}
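
A hypothetical usage example of the contract asserted here, using only the public mimalloc API: the `size` passed to `mi_free_size` must not exceed the block's usable size, which is itself at least the requested size.

```c
#include <mimalloc.h>

static void ex_free_size_usage(void) {
  void* p = mi_malloc(40);
  size_t usable = mi_usable_size(p);  // at least 40 for a valid block
  (void)usable;
  mi_free_size(p, 40);                // fine: 40 <= usable
}
```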
@ -525,24 +514,24 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
// only maintain stats for smaller objects if requested
#if (MI_STAT>0)
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
#if (MI_STAT < 2)
MI_UNUSED(block);
#endif
mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_usable_block_size(page);
// #if (MI_STAT>1)
// const size_t usize = mi_page_usable_size_of(page, block);
// mi_heap_stat_decrease(heap, malloc_requested, usize);
// #endif
if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
#if (MI_STAT>1)
const size_t usize = mi_page_usable_size_of(page, block);
mi_heap_stat_decrease(heap, malloc_requested, usize);
#endif
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, malloc_normal, bsize);
#if (MI_STAT > 1)
#if (MI_STAT > 1)
mi_heap_stat_decrease(heap, malloc_bins[_mi_bin(bsize)], 1);
#endif
#endif
}
//else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
// mi_heap_stat_decrease(heap, malloc_large, bsize);
//}
else {
mi_heap_stat_decrease(heap, malloc_huge, bsize);
const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc
mi_heap_stat_decrease(heap, malloc_huge, bpsize);
}
}
#else

View file

@ -95,11 +95,6 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t
mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL));
mi_collect_t collect = *((mi_collect_t*)arg_collect);
_mi_page_free_collect(page, collect >= MI_FORCE);
if (collect == MI_FORCE) {
// note: call before a potential `_mi_page_free` as the segment may be freed if this was the last used page in that segment.
mi_segment_t* segment = _mi_page_segment(page);
_mi_segment_collect(segment, true /* force? */);
}
if (mi_page_all_free(page)) {
// no more used blocks, free the page.
// note: this will free retired pages as well.
@ -132,15 +127,14 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id());
// note: never reclaim on collect but leave it to threads that need storage to reclaim
const bool force_main =
#ifdef NDEBUG
if (
#ifdef NDEBUG
collect == MI_FORCE
#else
#else
collect >= MI_FORCE
#endif
&& is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim;
if (force_main) {
#endif
&& is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim)
{
// the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
// if all memory is freed by now, all segments should be freed.
// note: this only collects in the current subprocess
@ -163,9 +157,8 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
// collect abandoned segments (in particular, purge expired parts of segments in the abandoned segment list)
// note: forced purge can be quite expensive if many threads are created/destroyed so we do not force on abandonment
_mi_abandoned_collect(heap, collect == MI_FORCE /* force? */, &heap->tld->segments);
// collect segments (purge pages, this can be expensive so don't force on abandonment)
_mi_segments_collect(collect == MI_FORCE, &heap->tld->segments);
// if forced, collect thread data cache on program-exit (or shared library unload)
if (force && is_main_thread && mi_heap_is_backing(heap)) {
@ -176,7 +169,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
_mi_arenas_collect(collect == MI_FORCE /* force purge? */);
// merge statistics
if (collect <= MI_FORCE) { _mi_stats_merge_thread(heap->tld); }
if (collect <= MI_FORCE) {
mi_stats_merge();
}
}
void _mi_heap_collect_abandon(mi_heap_t* heap) {
@ -333,26 +328,20 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
// stats
const size_t bsize = mi_page_block_size(page);
if (bsize > MI_MEDIUM_OBJ_SIZE_MAX) {
//if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
// mi_heap_stat_decrease(heap, malloc_large, bsize);
//}
//else
{
mi_heap_stat_decrease(heap, malloc_huge, bsize);
}
if (bsize > MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, malloc_huge, bsize);
}
#if (MI_STAT>0)
#if (MI_STAT)
_mi_page_free_collect(page, false); // update used count
const size_t inuse = page->used;
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, malloc_normal, bsize * inuse);
#if (MI_STAT>1)
#if (MI_STAT>1)
mi_heap_stat_decrease(heap, malloc_bins[_mi_bin(bsize)], inuse);
#endif
#endif
}
// mi_heap_stat_decrease(heap, malloc_requested, bsize * inuse); // todo: off for aligned blocks...
#endif
mi_heap_stat_decrease(heap, malloc_requested, bsize * inuse); // todo: off for aligned blocks...
#endif
/// pretend it is all free now
mi_assert_internal(mi_page_thread_free(page) == NULL);

View file

@ -34,12 +34,13 @@ const mi_page_t _mi_page_empty = {
MI_ATOMIC_VAR_INIT(0), // xthread_free
MI_ATOMIC_VAR_INIT(0), // xheap
NULL, NULL
, { 0 } // padding
#if MI_INTPTR_SIZE==4
, { NULL }
#endif
};
#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
#if (MI_SMALL_WSIZE_MAX==128)
#if (MI_PADDING>0) && (MI_INTPTR_SIZE >= 8)
#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
#elif (MI_PADDING>0)
@ -47,9 +48,7 @@ const mi_page_t _mi_page_empty = {
#else
#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() }
#endif
#else
#error "define right initialization sizes corresponding to MI_SMALL_WSIZE_MAX"
#endif
// Empty page queues for every bin
#define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) }
@ -64,8 +63,8 @@ const mi_page_t _mi_page_empty = {
QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \
QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \
QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 2) /* Full queue */ }
QNULL(MI_LARGE_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \
QNULL(MI_LARGE_OBJ_WSIZE_MAX + 2) /* Full queue */ }
#define MI_STAT_COUNT_NULL() {0,0,0}
@ -87,18 +86,6 @@ const mi_page_t _mi_page_empty = {
{ MI_INIT74(MI_STAT_COUNT_NULL) }, \
{ MI_INIT74(MI_STAT_COUNT_NULL) }
// Empty slice span queues for every bin
#define SQNULL(sz) { NULL, NULL, sz }
#define MI_SEGMENT_SPAN_QUEUES_EMPTY \
{ SQNULL(1), \
SQNULL( 1), SQNULL( 2), SQNULL( 3), SQNULL( 4), SQNULL( 5), SQNULL( 6), SQNULL( 7), SQNULL( 10), /* 8 */ \
SQNULL( 12), SQNULL( 14), SQNULL( 16), SQNULL( 20), SQNULL( 24), SQNULL( 28), SQNULL( 32), SQNULL( 40), /* 16 */ \
SQNULL( 48), SQNULL( 56), SQNULL( 64), SQNULL( 80), SQNULL( 96), SQNULL( 112), SQNULL( 128), SQNULL( 160), /* 24 */ \
SQNULL( 192), SQNULL( 224), SQNULL( 256), SQNULL( 320), SQNULL( 384), SQNULL( 448), SQNULL( 512), SQNULL( 640), /* 32 */ \
SQNULL( 768), SQNULL( 896), SQNULL( 1024) /* 35 */ }
// --------------------------------------------------------
// Statically allocate an empty heap as the initial
// thread local value for the default heap,
@ -108,7 +95,7 @@ const mi_page_t _mi_page_empty = {
// may lead to allocation itself on some platforms)
// --------------------------------------------------------
mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
mi_decl_hidden mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
NULL,
MI_ATOMIC_VAR_INIT(NULL),
0, // tid
@ -123,23 +110,12 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
false, // can reclaim
0, // tag
#if MI_GUARDED
0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY
};
static mi_decl_cache_align mi_subproc_t mi_subproc_default;
#define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats)))
mi_decl_cache_align static const mi_tld_t tld_empty = {
0,
false,
NULL, NULL,
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, tld_empty_stats }, // segments
{ MI_STAT_VERSION, MI_STATS_NULL } // stats
};
mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
return _mi_prim_thread_id();
@ -150,10 +126,15 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
extern mi_decl_hidden mi_heap_t _mi_heap_main;
static mi_decl_cache_align mi_subproc_t mi_subproc_default;
static mi_decl_cache_align mi_tld_t tld_main = {
0, false,
&_mi_heap_main, & _mi_heap_main,
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, &tld_main.stats }, // segments
&_mi_heap_main, &_mi_heap_main,
{ { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0},
0, 0, 0, 0, 0, &mi_subproc_default,
&tld_main.stats
}, // segments
{ MI_STAT_VERSION, MI_STATS_NULL } // stats
};
@ -172,7 +153,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = {
false, // can reclaim
0, // tag
#if MI_GUARDED
0, 0, 0, 0,
0, 0, 0, 0, 0,
#endif
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY
@ -184,14 +165,15 @@ mi_stats_t _mi_stats_main = { MI_STAT_VERSION, MI_STATS_NULL };
#if MI_GUARDED
mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) {
heap->guarded_sample_rate = sample_rate;
heap->guarded_sample_count = sample_rate; // count down samples
if (heap->guarded_sample_rate > 1) {
if (seed == 0) {
seed = _mi_heap_random_next(heap);
}
heap->guarded_sample_count = (seed % heap->guarded_sample_rate) + 1; // start at random count between 1 and `sample_rate`
heap->guarded_sample_seed = seed;
if (heap->guarded_sample_seed == 0) {
heap->guarded_sample_seed = _mi_heap_random_next(heap);
}
heap->guarded_sample_rate = sample_rate;
if (heap->guarded_sample_rate >= 1) {
heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate;
}
heap->guarded_sample_count = heap->guarded_sample_seed; // count down samples
}
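
A rough stand-alone sketch of this countdown sampling scheme (hypothetical names, and it assumes the counter restarts at `sample_rate` after a sample is taken, which this hunk does not show):

```c
#include <stddef.h>
#include <stdbool.h>

typedef struct ex_sampler_s { size_t rate; size_t countdown; } ex_sampler_t;

static void ex_sampler_init(ex_sampler_t* s, size_t rate, size_t seed) {
  s->rate = rate;
  // start at a random point in [1, rate] so threads do not sample in lock step
  s->countdown = (rate > 1 ? (seed % rate) + 1 : rate);
}

static bool ex_sampler_should_guard(ex_sampler_t* s) {
  if (s->rate == 0) return false;   // sampling disabled
  if (--s->countdown == 0) {
    s->countdown = s->rate;         // restart the window
    return true;                    // roughly one in `rate` calls returns true
  }
  return false;
}
```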
mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) {
@ -244,6 +226,7 @@ mi_heap_t* _mi_heap_main_get(void) {
return &_mi_heap_main;
}
/* -----------------------------------------------------------
Sub process
----------------------------------------------------------- */
@ -317,6 +300,7 @@ static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
static mi_thread_data_t* mi_thread_data_zalloc(void) {
// try to find thread metadata in the cache
bool is_zero = false;
mi_thread_data_t* td = NULL;
for (int i = 0; i < TD_CACHE_SIZE; i++) {
td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
@ -324,25 +308,32 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) {
// found cached allocation, try use it
td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
if (td != NULL) {
_mi_memzero(td, offsetof(mi_thread_data_t,memid));
return td;
break;
}
}
}
// if that fails, allocate as meta data
mi_memid_t memid;
td = (mi_thread_data_t*)_mi_os_zalloc(sizeof(mi_thread_data_t), &memid);
if (td == NULL) {
// if this fails, try once more. (issue #257)
td = (mi_thread_data_t*)_mi_os_zalloc(sizeof(mi_thread_data_t), &memid);
mi_memid_t memid;
td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
if (td == NULL) {
// really out of memory
_mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
return NULL;
// if this fails, try once more. (issue #257)
td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
if (td == NULL) {
// really out of memory
_mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
}
}
if (td != NULL) {
td->memid = memid;
is_zero = memid.initially_zero;
}
}
td->memid = memid;
if (td != NULL && !is_zero) {
_mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid));
}
return td;
}
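Note: mi_thread_data_zalloc above pairs a small lock-free cache of thread-metadata blocks with an OS fallback (plus a single retry, see issue #257). The cache pattern, reduced to its essentials as a C11 sketch (illustrative only; these names are not mimalloc code):
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#define CACHE_SIZE 16
static _Atomic(void*) cache[CACHE_SIZE];   // each slot holds NULL or a free block
// Claim a cached block, or return NULL so the caller allocates from the OS.
static void* cache_pop(void) {
  for (int i = 0; i < CACHE_SIZE; i++) {
    if (atomic_load_explicit(&cache[i], memory_order_relaxed) != NULL) {
      void* p = atomic_exchange_explicit(&cache[i], NULL, memory_order_acq_rel);
      if (p != NULL) return p;           // we won the slot
    }
  }
  return NULL;
}
// Store a block in an empty slot, or return false so the caller frees it to the OS.
static bool cache_push(void* p) {
  for (int i = 0; i < CACHE_SIZE; i++) {
    void* expected = NULL;
    if (atomic_compare_exchange_strong_explicit(&cache[i], &expected, p,
          memory_order_acq_rel, memory_order_relaxed)) {
      return true;
    }
  }
  return false;
}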
@ -400,7 +391,7 @@ static bool _mi_thread_heap_init(void) {
// initialize thread local data
void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) {
_mi_memcpy_aligned(tld, &tld_empty, sizeof(mi_tld_t));
_mi_memzero_aligned(tld,sizeof(mi_tld_t));
tld->heap_backing = bheap;
tld->heaps = NULL;
tld->segments.subproc = &mi_subproc_default;
@ -441,10 +432,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) {
// free if not the main thread
if (heap != &_mi_heap_main) {
// the following assertion does not always hold for huge segments as those are always treated
// as abandoned: one may allocate it in one thread, but deallocate in another, in which case
// the count can be too large or negative. todo: perhaps not count huge segments? see issue #363
// mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id());
mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id());
mi_thread_data_free((mi_thread_data_t*)heap);
}
else {
@ -577,7 +565,7 @@ mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept {
}
// Called once by the process loader from `src/prim/prim.c`
void _mi_auto_process_init(void) {
void _mi_process_load(void) {
mi_heap_main_init();
#if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
@ -659,13 +647,13 @@ void mi_process_init(void) mi_attr_noexcept {
if (mi_option_is_enabled(mi_option_reserve_os_memory)) {
long ksize = mi_option_get(mi_option_reserve_os_memory);
if (ksize > 0) {
mi_reserve_os_memory((size_t)ksize*MI_KiB, true /* commit? */, true /* allow large pages? */);
mi_reserve_os_memory((size_t)ksize*MI_KiB, true, true);
}
}
}
// Called when the process is done (cdecl as it is used with `at_exit` on some platforms)
void mi_cdecl mi_process_done(void) mi_attr_noexcept {
// Called when the process is done (through `at_exit`)
void mi_cdecl _mi_process_done(void) {
// only shutdown if we were initialized
if (!_mi_process_is_initialized) return;
// ensure we are called once
@ -708,7 +696,3 @@ void mi_cdecl mi_process_done(void) mi_attr_noexcept {
os_preloading = true; // don't call the C runtime anymore
}
void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept {
if (_mi_option_get_fast(mi_option_destroy_on_exit)>1) return;
mi_process_done();
}


@ -275,60 +275,3 @@ int _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) {
va_end(args);
return written;
}
#if MI_SIZE_SIZE == 4
#define mi_mask_even_bits32 (0x55555555)
#define mi_mask_even_pairs32 (0x33333333)
#define mi_mask_even_nibbles32 (0x0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
static size_t mi_byte_sum32(uint32_t x) {
// perform `x * 0x01010101`: the highest byte contains the sum of all bytes.
x += (x << 8);
x += (x << 16);
return (size_t)(x >> 24);
}
static size_t mi_popcount_generic32(uint32_t x) {
// first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10
// in other words, `a - (a>>1)`; to do this in parallel, we need to mask to prevent spilling a bit pair
// into the lower bit-pair:
x = x - ((x >> 1) & mi_mask_even_bits32);
// add the 2-bit pair results
x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32);
// add the 4-bit nibble results
x = (x + (x >> 4)) & mi_mask_even_nibbles32;
// each byte now has a count of its bits, we can sum them now:
return mi_byte_sum32(x);
}
mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic32(x);
}
#else
#define mi_mask_even_bits64 (0x5555555555555555)
#define mi_mask_even_pairs64 (0x3333333333333333)
#define mi_mask_even_nibbles64 (0x0F0F0F0F0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
static size_t mi_byte_sum64(uint64_t x) {
x += (x << 8);
x += (x << 16);
x += (x << 32);
return (size_t)(x >> 56);
}
static size_t mi_popcount_generic64(uint64_t x) {
x = x - ((x >> 1) & mi_mask_even_bits64);
x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64);
x = (x + (x >> 4)) & mi_mask_even_nibbles64;
return mi_byte_sum64(x);
}
mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic64(x);
}
#endif
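As a sanity check on the SWAR steps above, the same three masking steps plus the byte sum (written here as the `* 0x01010101` multiply that the shift-add byte sum computes) can be compared against a naive bit loop. Standalone test, illustrative only:
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
static size_t popcount_naive32(uint32_t x) {
  size_t n = 0;
  while (x != 0) { n += (x & 1); x >>= 1; }
  return n;
}
int main(void) {
  const uint32_t samples[] = { 0x0u, 0x1u, 0x12345678u, 0xF0F0F0F0u, 0xFFFFFFFFu };
  for (size_t i = 0; i < sizeof(samples)/sizeof(samples[0]); i++) {
    const uint32_t x = samples[i];
    uint32_t y = x - ((x >> 1) & 0x55555555u);            // 2-bit counts
    y = (y & 0x33333333u) + ((y >> 2) & 0x33333333u);     // 4-bit counts
    y = (y + (y >> 4)) & 0x0F0F0F0Fu;                     // per-byte counts
    const size_t sum = (size_t)((y * 0x01010101u) >> 24); // sum of the four bytes
    assert(sum == popcount_naive32(x));
  }
  return 0;
}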


@ -106,11 +106,11 @@ typedef struct mi_option_desc_s {
static mi_option_desc_t options[_mi_option_last] =
{
// stable options
#if MI_DEBUG || defined(MI_SHOW_ERRORS)
#if MI_DEBUG || defined(MI_SHOW_ERRORS)
{ 1, UNINIT, MI_OPTION(show_errors) },
#else
#else
{ 0, UNINIT, MI_OPTION(show_errors) },
#endif
#endif
{ 0, UNINIT, MI_OPTION(show_stats) },
{ MI_DEFAULT_VERBOSE, UNINIT, MI_OPTION(verbose) },
@ -129,7 +129,7 @@ static mi_option_desc_t options[_mi_option_last] =
UNINIT, MI_OPTION(reserve_os_memory) }, // reserve N KiB OS memory in advance (use `option_get_size`)
{ 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread
{ 0, UNINIT, MI_OPTION(deprecated_page_reset) }, // reset page memory on free
{ 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_purge,abandoned_page_reset) }, // reset free page memory when a thread terminates
{ 0, UNINIT, MI_OPTION(abandoned_page_purge) }, // purge free page memory when a thread terminates
{ 0, UNINIT, MI_OPTION(deprecated_segment_reset) }, // reset segment memory on free (needs eager commit)
#if defined(__NetBSD__)
{ 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
@ -425,14 +425,14 @@ static mi_decl_noinline void mi_recurse_exit_prim(void) {
}
static bool mi_recurse_enter(void) {
#if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD)
#if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
if (_mi_preloading()) return false;
#endif
return mi_recurse_enter_prim();
}
static void mi_recurse_exit(void) {
#if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD)
#if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
if (_mi_preloading()) return;
#endif
mi_recurse_exit_prim();
@ -525,7 +525,7 @@ void _mi_warning_message(const char* fmt, ...) {
#if MI_DEBUG
mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) mi_attr_noexcept {
void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) {
_mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion);
abort();
}

src/os.c

@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -91,6 +91,21 @@ void _mi_os_init(void) {
bool _mi_os_decommit(void* addr, size_t size);
bool _mi_os_commit(void* addr, size_t size, bool* is_zero);
static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
mi_assert_internal(alignment != 0);
uintptr_t mask = alignment - 1;
if ((alignment & mask) == 0) { // power of two?
return (sz & ~mask);
}
else {
return ((sz / alignment) * alignment);
}
}
static void* mi_align_down_ptr(void* p, size_t alignment) {
return (void*)_mi_align_down((uintptr_t)p, alignment);
}
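The align-down helper above takes the mask fast path only when the alignment is a power of two and otherwise falls back to integer division. A tiny standalone check (local copy of the logic, not the mimalloc function itself):
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
static uintptr_t align_down(uintptr_t sz, size_t alignment) {
  const uintptr_t mask = alignment - 1;
  if ((alignment & mask) == 0) return (sz & ~mask);  // power of two: mask
  return (sz / alignment) * alignment;               // general case: divide
}
int main(void) {
  assert(align_down(4100, 4096) == 4096);  // mask fast path
  assert(align_down(4100, 1000) == 4000);  // division fallback
  assert(align_down(4095, 4096) == 0);     // below one alignment unit
  return 0;
}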
/* -----------------------------------------------------------
aligned hinting
@ -152,8 +167,8 @@ static void mi_os_free_huge_os_pages(void* p, size_t size);
static void mi_os_prim_free(void* addr, size_t size, size_t commit_size) {
mi_assert_internal((size % _mi_os_page_size()) == 0);
if (addr == NULL) return; // || _mi_os_is_huge_reserved(addr)
int err = _mi_prim_free(addr, size); // allow size==0 (issue #1041)
if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr)
int err = _mi_prim_free(addr, size);
if (err != 0) {
_mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr);
}
@ -166,16 +181,15 @@ static void mi_os_prim_free(void* addr, size_t size, size_t commit_size) {
void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) {
if (mi_memkind_is_os(memid.memkind)) {
size_t csize = memid.mem.os.size;
if (csize==0) { csize = _mi_os_good_alloc_size(size); }
mi_assert_internal(csize >= size);
if (csize==0) { _mi_os_good_alloc_size(size); }
size_t commit_size = (still_committed ? csize : 0);
void* base = addr;
// different base? (due to alignment)
if (memid.mem.os.base != base) {
mi_assert(memid.mem.os.base <= addr);
mi_assert(memid.mem.os.base <= addr);
base = memid.mem.os.base;
const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base;
if (memid.mem.os.size==0) {
if (memid.mem.os.size==0) {
csize += diff;
}
if (still_committed) {
@ -286,10 +300,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
// explicitly commit only the aligned part
if (commit) {
if (!_mi_os_commit(p, size, NULL)) {
mi_os_prim_free(*base, over_size, 0);
return NULL;
}
_mi_os_commit(p, size, NULL);
}
}
else { // mmap can free inside an allocation
@ -327,11 +338,9 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid) {
bool os_is_large = false;
bool os_is_zero = false;
void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero);
if (p == NULL) return NULL;
*memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large);
mi_assert_internal(memid->mem.os.size >= size);
mi_assert_internal(memid->initially_committed);
if (p != NULL) {
*memid = _mi_memid_create_os(true, os_is_zero, os_is_large);
}
return p;
}
@ -347,42 +356,15 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
bool os_is_zero = false;
void* os_base = NULL;
void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base );
if (p == NULL) return NULL;
*memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large);
memid->mem.os.base = os_base;
memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned?
mi_assert_internal(memid->mem.os.size >= size);
mi_assert_internal(_mi_is_aligned(p,alignment));
if (commit) { mi_assert_internal(memid->initially_committed); }
return p;
}
mi_decl_nodiscard static void* mi_os_ensure_zero(void* p, size_t size, mi_memid_t* memid) {
if (p==NULL || size==0) return p;
// ensure committed
if (!memid->initially_committed) {
bool is_zero = false;
if (!_mi_os_commit(p, size, &is_zero)) {
_mi_os_free(p, size, *memid);
return NULL;
}
memid->initially_committed = true;
if (p != NULL) {
*memid = _mi_memid_create_os(commit, os_is_zero, os_is_large);
memid->mem.os.base = os_base;
// memid->mem.os.alignment = alignment;
memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned
}
// ensure zero'd
if (memid->initially_zero) return p;
_mi_memzero_aligned(p,size);
memid->initially_zero = true;
return p;
}
void* _mi_os_zalloc(size_t size, mi_memid_t* memid) {
void* p = _mi_os_alloc(size,memid);
return mi_os_ensure_zero(p, size, memid);
}
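The zalloc path above reduces to: allocate, commit the range if it was not committed, then zero it only when the OS did not already hand out zeroed memory. A simplified sketch of that shape with error handling trimmed (the memid_t struct and commit callback are hypothetical stand-ins, not the real mimalloc types):
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
typedef struct memid_s { bool initially_committed; bool initially_zero; } memid_t;
static void* ensure_zero(void* p, size_t size, memid_t* memid,
                         bool (*commit)(void* p, size_t size, bool* is_zero)) {
  if (p == NULL || size == 0) return p;
  if (!memid->initially_committed) {          // commit on demand
    bool is_zero = false;
    if (!commit(p, size, &is_zero)) return NULL;
    memid->initially_committed = true;
    memid->initially_zero = is_zero;
  }
  if (!memid->initially_zero) {               // zero only when needed
    memset(p, 0, size);
    memid->initially_zero = true;
  }
  return p;
}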
/* -----------------------------------------------------------
OS aligned allocation with an offset. This is used
for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc
@ -528,17 +510,6 @@ bool _mi_os_reset(void* addr, size_t size) {
}
void _mi_os_reuse( void* addr, size_t size ) {
// page align conservatively within the range
size_t csize = 0;
void* const start = mi_os_page_align_area_conservative(addr, size, &csize);
if (csize == 0) return;
const int err = _mi_prim_reuse(start, csize);
if (err != 0) {
_mi_warning_message("cannot reuse OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
}
}
// either resets or decommits memory, returns true if the memory needs
// to be recommitted if it is to be re-used later on.
bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size)
@ -548,7 +519,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size)
mi_os_stat_increase(purged, size);
if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit?
!_mi_preloading()) // don't decommit during preloading (unsafe)
!_mi_preloading()) // don't decommit during preloading (unsafe)
{
bool needs_recommit = true;
mi_os_decommit_ex(p, size, &needs_recommit, stat_size);
@ -568,6 +539,7 @@ bool _mi_os_purge(void* p, size_t size) {
return _mi_os_purge_ex(p, size, true, size);
}
// Protect a region in memory to be not accessible.
static bool mi_os_protectx(void* addr, size_t size, bool protect) {
// page align conservatively within the range
@ -646,7 +618,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
if (psize != NULL) *psize = 0;
if (pages_reserved != NULL) *pages_reserved = 0;
size_t size = 0;
uint8_t* const start = mi_os_claim_huge_pages(pages, &size);
uint8_t* start = mi_os_claim_huge_pages(pages, &size);
if (start == NULL) return NULL; // or 32-bit systems
// Allocate one page at the time but try to place them contiguously
@ -702,7 +674,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
if (page != 0) {
mi_assert(start != NULL);
*memid = _mi_memid_create_os(start, size, true /* is committed */, all_zero, true /* is_large */);
*memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */);
memid->memkind = MI_MEM_OS_HUGE;
mi_assert(memid->is_pinned);
#ifdef MI_TRACK_ASAN
@ -724,47 +696,34 @@ static void mi_os_free_huge_os_pages(void* p, size_t size) {
}
}
/* ----------------------------------------------------------------------------
Support NUMA aware allocation
-----------------------------------------------------------------------------*/
static _Atomic(size_t) mi_numa_node_count; // = 0 // cache the node count
_Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count
int _mi_os_numa_node_count(void) {
size_t count = mi_atomic_load_acquire(&mi_numa_node_count);
if mi_unlikely(count == 0) {
size_t _mi_os_numa_node_count_get(void) {
size_t count = mi_atomic_load_acquire(&_mi_numa_node_count);
if (count <= 0) {
long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
if (ncount > 0 && ncount < INT_MAX) {
if (ncount > 0) {
count = (size_t)ncount;
}
else {
const size_t n = _mi_prim_numa_node_count(); // or detect dynamically
if (n == 0 || n > INT_MAX) { count = 1; }
else { count = n; }
count = _mi_prim_numa_node_count(); // or detect dynamically
if (count == 0) count = 1;
}
mi_atomic_store_release(&mi_numa_node_count, count); // save it
mi_atomic_store_release(&_mi_numa_node_count, count); // save it
_mi_verbose_message("using %zd numa regions\n", count);
}
mi_assert_internal(count > 0 && count <= INT_MAX);
return (int)count;
return count;
}
static int mi_os_numa_node_get(void) {
int numa_count = _mi_os_numa_node_count();
int _mi_os_numa_node_get(void) {
size_t numa_count = _mi_os_numa_node_count();
if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
// never more than the node count and >= 0
const size_t n = _mi_prim_numa_node();
int numa_node = (n < INT_MAX ? (int)n : 0);
size_t numa_node = _mi_prim_numa_node();
if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
return numa_node;
}
int _mi_os_numa_node(void) {
if mi_likely(mi_atomic_load_relaxed(&mi_numa_node_count) == 1) {
return 0;
}
else {
return mi_os_numa_node_get();
}
return (int)numa_node;
}
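The node lookup above caches the detected node count and then folds whatever node id the OS reports into [0, count) so it can be used directly as an index. The clamp itself, as a standalone sketch (hypothetical name):
#include <stddef.h>
static int numa_node_clamp(size_t reported_node, size_t node_count) {
  if (node_count <= 1) return 0;                       // single-node fast path
  if (reported_node >= node_count) reported_node %= node_count;
  return (int)reported_node;
}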


@ -12,7 +12,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MI_IN_PAGE_C
#error "this file should be included from 'page.c'"
// include to help an IDE
#include "mimalloc.h"
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"
#endif
@ -38,15 +38,15 @@ terms of the MIT license. A copy of the license can be found in the file
static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+sizeof(uintptr_t)));
return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+sizeof(uintptr_t)));
}
static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+(2*sizeof(uintptr_t))));
return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+(2*sizeof(uintptr_t))));
}
static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
return (pq->block_size > MI_MEDIUM_OBJ_SIZE_MAX);
return (pq->block_size > MI_LARGE_OBJ_SIZE_MAX);
}
/* -----------------------------------------------------------
@ -58,7 +58,7 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
// We use `wsize` for the size in "machine word sizes",
// i.e. byte size == `wsize*sizeof(void*)`.
static inline size_t mi_bin(size_t size) {
size_t wsize = _mi_wsize_from_size(size);
size_t wsize = _mi_wsize_from_size(size);
#if defined(MI_ALIGN4W)
if mi_likely(wsize <= 4) {
return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes
@ -72,7 +72,7 @@ static inline size_t mi_bin(size_t size) {
return (wsize == 0 ? 1 : wsize);
}
#endif
else if mi_unlikely(wsize > MI_MEDIUM_OBJ_WSIZE_MAX) {
else if mi_unlikely(wsize > MI_LARGE_OBJ_WSIZE_MAX) {
return MI_BIN_HUGE;
}
else {
@ -107,7 +107,7 @@ size_t _mi_bin_size(size_t bin) {
// Good size for allocation
size_t mi_good_size(size_t size) mi_attr_noexcept {
if (size <= MI_MEDIUM_OBJ_SIZE_MAX) {
if (size <= MI_LARGE_OBJ_SIZE_MAX) {
return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE));
}
else {
@ -136,11 +136,7 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t*
}
#endif
static inline bool mi_page_is_large_or_huge(const mi_page_t* page) {
return (mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_huge(page));
}
size_t _mi_page_bin(const mi_page_t* page) {
static size_t mi_page_bin(const mi_page_t* page) {
const size_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page))));
mi_assert_internal(bin <= MI_BIN_FULL);
return bin;
@ -148,10 +144,10 @@ size_t _mi_page_bin(const mi_page_t* page) {
static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
mi_assert_internal(heap!=NULL);
const size_t bin = _mi_page_bin(page);
const size_t bin = mi_page_bin(page);
mi_page_queue_t* pq = &heap->pages[bin];
mi_assert_internal((mi_page_block_size(page) == pq->block_size) ||
(mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(pq)) ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(pq)) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(pq)));
return pq;
}
@ -214,11 +210,10 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(mi_page_queue_contains(queue, page));
mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) ||
mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_heap_t* heap = mi_page_heap(page);
if (page->prev != NULL) page->prev->next = page->next;
if (page->next != NULL) page->next->prev = page->prev;
if (page == queue->last) queue->last = page->prev;
@ -240,10 +235,10 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(!mi_page_queue_contains(queue, page));
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
#endif
mi_assert_internal(mi_page_block_size(page) == queue->block_size ||
(mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) ||
(mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
@ -282,8 +277,8 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t*
mi_assert_internal((bsize == to->block_size && bsize == from->block_size) ||
(bsize == to->block_size && mi_page_queue_is_full(from)) ||
(bsize == from->block_size && mi_page_queue_is_full(to)) ||
(mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(to)) ||
(mi_page_is_large_or_huge(page) && mi_page_queue_is_full(to)));
(mi_page_is_huge(page) && mi_page_queue_is_huge(to)) ||
(mi_page_is_huge(page) && mi_page_queue_is_full(to)));
mi_heap_t* heap = mi_page_heap(page);
@ -322,8 +317,8 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t*
page->prev = to->first;
page->next = next;
to->first->next = page;
if (next != NULL) {
next->prev = page;
if (next != NULL) {
next->prev = page;
}
else {
to->last = page;


@ -37,7 +37,7 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta
}
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
static bool mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
#if (MI_DEBUG>=3)
static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
@ -82,9 +82,11 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
mi_assert_internal(page->used <= page->capacity);
mi_assert_internal(page->capacity <= page->reserved);
// const size_t bsize = mi_page_block_size(page);
mi_segment_t* segment = _mi_page_segment(page);
uint8_t* start = mi_page_start(page);
mi_assert_internal(start == _mi_segment_page_start(_mi_page_segment(page), page, NULL));
mi_assert_internal(page->is_huge == (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE));
mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE));
//mi_assert_internal(start + page->capacity*page->block_size == page->top);
mi_assert_internal(mi_page_list_is_valid(page,page->free));
@ -112,7 +114,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
return true;
}
extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called?
extern bool _mi_process_is_initialized; // has mi_process_init been called?
bool _mi_page_is_valid(mi_page_t* page) {
mi_assert_internal(mi_page_is_valid_init(page));
@ -121,15 +123,14 @@ bool _mi_page_is_valid(mi_page_t* page) {
#endif
if (mi_page_heap(page)!=NULL) {
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id);
mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == mi_page_heap(page)->thread_id || segment->thread_id==0);
#if MI_HUGE_PAGE_ABANDON
if (segment->kind != MI_SEGMENT_HUGE)
if (segment->page_kind != MI_PAGE_HUGE)
#endif
{
mi_page_queue_t* pq = mi_page_queue_of(page);
mi_assert_internal(mi_page_queue_contains(pq, page));
mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page));
mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page));
mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
}
}
@ -256,11 +257,10 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
// called from segments when reclaiming abandoned pages
void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
mi_assert_expensive(mi_page_is_valid_init(page));
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
#endif
// TODO: push on full queue immediately if it is full?
@ -274,7 +274,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size
#if !MI_HUGE_PAGE_ABANDON
mi_assert_internal(pq != NULL);
mi_assert_internal(mi_heap_contains_queue(heap, pq));
mi_assert_internal(page_alignment > 0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || block_size == pq->block_size);
mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_OBJ_SIZE_MAX || block_size == pq->block_size);
#endif
mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments);
if (page == NULL) {
@ -284,14 +284,13 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
#endif
mi_assert_internal(page_alignment >0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
// a fresh page was found, initialize it
const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc
mi_assert_internal(full_block_size >= block_size);
mi_page_init(heap, page, full_block_size, heap->tld);
mi_heap_stat_increase(heap, pages, 1);
mi_heap_stat_increase(heap, page_bins[_mi_page_bin(page)], 1);
mi_heap_stat_increase(heap, page_bins[mi_page_bin(page)], 1);
if (pq != NULL) { mi_page_queue_push(heap, pq, page); }
mi_assert_expensive(_mi_page_is_valid(page));
return page;
@ -427,7 +426,6 @@ void _mi_page_force_abandon(mi_page_t* page) {
}
}
// Free a page with no more free blocks
void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
mi_assert_internal(page != NULL);
@ -445,12 +443,13 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
mi_segments_tld_t* segments_tld = &heap->tld->segments;
mi_page_queue_remove(pq, page);
// and free it
// and free it
mi_heap_stat_decrease(heap, page_bins[mi_page_bin(page)], 1);
mi_page_set_heap(page,NULL);
_mi_segment_page_free(page, force, segments_tld);
}
#define MI_MAX_RETIRE_SIZE MI_MEDIUM_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE
#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE
#define MI_RETIRE_CYCLES (16)
// Retire a page with no more used blocks
@ -624,7 +623,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
#if (MI_SECURE>0)
#define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many
#else
#define MI_MIN_EXTEND (4)
#define MI_MIN_EXTEND (1)
#endif
// Extend the capacity (up to reserved) by initializing a free list
@ -632,15 +631,18 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
// Note: we also experimented with "bump" allocation on the first
// allocations but this did not speed up any benchmark (due to an
// extra test in malloc? or cache effects?)
static bool mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
mi_assert_expensive(mi_page_is_valid_init(page));
#if (MI_SECURE<=2)
mi_assert(page->free == NULL);
mi_assert(page->local_free == NULL);
if (page->free != NULL) return true;
if (page->free != NULL) return;
#endif
if (page->capacity >= page->reserved) return true;
if (page->capacity >= page->reserved) return;
size_t page_size;
//uint8_t* page_start =
_mi_segment_page_start(_mi_page_segment(page), page, &page_size);
mi_stat_counter_increase(tld->stats.pages_extended, 1);
// calculate the extend count
@ -672,7 +674,6 @@ static bool mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
page->capacity += (uint16_t)extend;
mi_stat_increase(tld->stats.page_committed, extend * bsize);
mi_assert_expensive(mi_page_is_valid_init(page));
return true;
}
// Initialize a fresh page
@ -687,8 +688,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
size_t page_size;
page->page_start = _mi_segment_page_start(segment, page, &page_size);
mi_track_mem_noaccess(page->page_start,page_size);
mi_assert_internal(mi_page_block_size(page) <= page_size);
mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE);
mi_assert_internal(page_size / block_size < (1L<<16));
page->reserved = (uint16_t)(page_size / block_size);
mi_assert_internal(page->reserved > 0);
@ -703,7 +702,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size));
}
#endif
mi_assert_internal(page->is_committed);
if (block_size > 0 && _mi_is_power_of_two(block_size)) {
page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size));
}
@ -727,10 +725,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_expensive(mi_page_is_valid_init(page));
// initialize an initial free list
if (mi_page_extend_free(heap,page,tld)) {
mi_assert(mi_page_immediate_available(page));
}
return;
mi_page_extend_free(heap,page,tld);
mi_assert(mi_page_immediate_available(page));
}
@ -822,18 +818,13 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
if (page_candidate != NULL) {
page = page_candidate;
}
if (page != NULL) {
if (!mi_page_immediate_available(page)) {
mi_assert_internal(mi_page_is_expandable(page));
if (!mi_page_extend_free(heap, page, heap->tld)) {
page = NULL; // failed to extend
}
}
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
if (page != NULL && !mi_page_immediate_available(page)) {
mi_assert_internal(mi_page_is_expandable(page));
mi_page_extend_free(heap, page, heap->tld);
}
if (page == NULL) {
_mi_heap_collect_retired(heap, false); // perhaps make a page available?
_mi_heap_collect_retired(heap, false); // perhaps make a page available
page = mi_page_fresh(heap, pq);
if (page == NULL && first_try) {
// out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
@ -911,47 +902,31 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex
General allocation
----------------------------------------------------------- */
// Large and huge page allocation.
// Huge pages contain just one block, and the segment contains just that page (as `MI_SEGMENT_HUGE`).
// Huge pages contain just one block, and the segment contains just that page.
// Huge pages are also used if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX)
// so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`.
static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) {
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) {
size_t block_size = _mi_os_good_alloc_size(size);
mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0);
bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX || page_alignment > 0);
#if MI_HUGE_PAGE_ABANDON
mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size));
mi_page_queue_t* pq = NULL;
#else
mi_page_queue_t* pq = mi_page_queue(heap, is_huge ? MI_LARGE_OBJ_SIZE_MAX+1 : block_size);
mi_assert_internal(!is_huge || mi_page_queue_is_huge(pq));
mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size
mi_assert_internal(mi_page_queue_is_huge(pq));
#endif
mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment);
if (page != NULL) {
mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(mi_page_immediate_available(page));
if (is_huge) {
mi_assert_internal(mi_page_is_huge(page));
mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
mi_assert_internal(_mi_page_segment(page)->used==1);
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
mi_page_set_heap(page, NULL);
#endif
}
else {
mi_assert_internal(!mi_page_is_huge(page));
}
const size_t bsize = mi_page_usable_block_size(page); // note: not `mi_page_block_size` to account for padding
/*if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_increase(heap, malloc_large, bsize);
mi_heap_stat_counter_increase(heap, malloc_large_count, 1);
}
else */
{
_mi_stat_increase(&heap->tld->stats.malloc_huge, bsize);
_mi_stat_counter_increase(&heap->tld->stats.malloc_huge_count, 1);
}
mi_assert_internal(mi_page_is_huge(page));
mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE);
mi_assert_internal(_mi_page_segment(page)->used==1);
#if MI_HUGE_PAGE_ABANDON
mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
mi_page_set_heap(page, NULL);
#endif
mi_heap_stat_increase(heap, malloc_huge, mi_page_block_size(page));
mi_heap_stat_counter_increase(heap, malloc_huge_count, 1);
}
return page;
}
@ -962,13 +937,13 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t
static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept {
// huge allocation?
const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) {
if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) {
if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) {
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
return NULL;
}
else {
return mi_large_huge_page_alloc(heap,size,huge_alignment);
return mi_huge_page_alloc(heap,size,huge_alignment);
}
}
else {
@ -1004,9 +979,9 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
// free delayed frees from other threads (but skip contended ones)
_mi_heap_delayed_free_partial(heap);
// collect every once in a while (10000 by default)
const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L);
const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L);
if (heap->generic_collect_count >= generic_collect) {
heap->generic_collect_count = 0;
mi_heap_collect(heap, false /* force? */);


@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2025, Microsoft Research, Daan Leijen, Alon Zakai
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen, Alon Zakai
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -58,7 +58,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config) {
extern void emmalloc_free(void*);
int _mi_prim_free(void* addr, size_t size) {
if (size==0) return 0;
MI_UNUSED(size);
emmalloc_free(addr);
return 0;
}
@ -114,11 +114,6 @@ int _mi_prim_reset(void* addr, size_t size) {
return 0;
}
int _mi_prim_reuse(void* addr, size_t size) {
MI_UNUSED(addr); MI_UNUSED(size);
return 0;
}
int _mi_prim_protect(void* addr, size_t size, bool protect) {
MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect);
return 0;


@ -39,29 +39,29 @@ terms of the MIT license. A copy of the license can be found in the file
#define mi_attr_destructor __attribute__((destructor))
#endif
static void mi_attr_constructor mi_process_attach(void) {
_mi_auto_process_init();
_mi_process_load();
}
static void mi_attr_destructor mi_process_detach(void) {
_mi_auto_process_done();
_mi_process_done();
}
#elif defined(__cplusplus)
// C++: use static initialization to detect process start/end
// This is not guaranteed to be first/last but the best we can generally do?
struct mi_init_done_t {
mi_init_done_t() {
_mi_auto_process_init();
_mi_process_load();
}
~mi_init_done_t() {
_mi_auto_process_done();
_mi_process_done();
}
};
static mi_init_done_t mi_init_done;
#else
#pragma message("define a way to call _mi_auto_process_init/done on your platform")
#pragma message("define a way to call _mi_process_load/done on your platform")
#endif
#endif
// Generic allocator init/done callback
// Generic allocator init/done callback
#ifndef MI_PRIM_HAS_ALLOCATOR_INIT
bool _mi_is_redirected(void) {
return false;


@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -31,12 +31,11 @@ terms of the MIT license. A copy of the license can be found in the file
#if defined(__linux__)
#include <features.h>
#include <sys/prctl.h> // THP disable, PR_SET_VMA
#if defined(__GLIBC__) && !defined(PR_SET_VMA)
#include <linux/prctl.h>
#endif
//#if defined(MI_NO_THP)
#include <sys/prctl.h> // THP disable
//#endif
#if defined(__GLIBC__)
#include <linux/mman.h> // linux mmap flags
#include <linux/mman.h> // linux mmap flags
#else
#include <sys/mman.h>
#endif
@ -70,8 +69,7 @@ terms of the MIT license. A copy of the license can be found in the file
#define MADV_FREE POSIX_MADV_FREE
#endif
#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this?
//------------------------------------------------------------------------------------
// Use syscalls for some primitives to allow for libraries that override open/read/close etc.
// and do allocation themselves; using syscalls prevents recursion when mimalloc is
@ -157,7 +155,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
}
#endif
}
config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE;
config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
config->has_overcommit = unix_detect_overcommit();
config->has_partial_free = true; // mmap can free in parts
config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE)
@ -187,7 +185,6 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
//---------------------------------------------
int _mi_prim_free(void* addr, size_t size ) {
if (size==0) return 0;
bool err = (munmap(addr, size) == -1);
return (err ? errno : 0);
}
@ -208,24 +205,14 @@ static int unix_madvise(void* addr, size_t size, int advice) {
return (res==0 ? 0 : errno);
}
static void* unix_mmap_prim(void* addr, size_t size, int protect_flags, int flags, int fd) {
void* p = mmap(addr, size, protect_flags, flags, fd, 0 /* offset */);
#if defined(__linux__) && defined(PR_SET_VMA)
if (p!=MAP_FAILED && p!=NULL) {
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p, size, "mimalloc");
}
#endif
return p;
}
static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
MI_UNUSED(try_alignment);
void* p = NULL;
#if defined(MAP_ALIGNED) // BSD
if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
size_t n = mi_bsr(try_alignment);
if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB
p = unix_mmap_prim(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd);
p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
int err = errno;
_mi_trace_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr);
@ -236,7 +223,7 @@ static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignmen
}
#elif defined(MAP_ALIGN) // Solaris
if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
p = unix_mmap_prim((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd); // addr parameter is the required alignment
p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment
if (p!=MAP_FAILED) return p;
// fall back to regular mmap
}
@ -246,7 +233,7 @@ static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignmen
if (addr == NULL) {
void* hint = _mi_os_get_aligned_hint(try_alignment, size);
if (hint != NULL) {
p = unix_mmap_prim(hint, size, protect_flags, flags, fd);
p = mmap(hint, size, protect_flags, flags, fd, 0);
if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
#if MI_TRACK_ENABLED // asan sometimes does not instrument errno correctly?
int err = 0;
@ -261,7 +248,7 @@ static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignmen
}
#endif
// regular mmap
p = unix_mmap_prim(addr, size, protect_flags, flags, fd);
p = mmap(addr, size, protect_flags, flags, fd, 0);
if (p!=MAP_FAILED) return p;
// failed to allocate
return NULL;
@ -332,7 +319,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec
if (large_only || lflags != flags) {
// try large OS page allocation
*is_large = true;
p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, lflags, lfd);
p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd);
#ifdef MAP_HUGE_1GB
if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) {
mi_huge_pages_available = false; // don't try huge 1GiB pages again
@ -340,7 +327,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec
_mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno);
}
lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, lflags, lfd);
p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd);
}
#endif
if (large_only) return p;
@ -353,7 +340,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec
// regular allocation
if (p == NULL) {
*is_large = false;
p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, flags, fd);
p = unix_mmap_prim(addr, size, try_alignment, protect_flags, flags, fd);
if (p != NULL) {
#if defined(MADV_HUGEPAGE)
// Many Linux systems don't allow MAP_HUGETLB but they support instead
@ -387,9 +374,6 @@ int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool comm
mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
mi_assert_internal(commit || !allow_large);
mi_assert_internal(try_alignment > 0);
if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) {
try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations
}
*is_zero = true;
int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
@ -414,6 +398,10 @@ static void unix_mprotect_hint(int err) {
#endif
}
int _mi_prim_commit(void* start, size_t size, bool* is_zero) {
// commit: ensure we can access the area
// note: we may think that *is_zero can be true since the memory
@ -429,25 +417,11 @@ int _mi_prim_commit(void* start, size_t size, bool* is_zero) {
return err;
}
int _mi_prim_reuse(void* start, size_t size) {
MI_UNUSED(start); MI_UNUSED(size);
#if defined(__APPLE__) && defined(MADV_FREE_REUSE)
return unix_madvise(start, size, MADV_FREE_REUSE);
#endif
return 0;
}
int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
int err = 0;
#if defined(__APPLE__) && defined(MADV_FREE_REUSABLE)
// decommit on macOS: use MADV_FREE_REUSABLE as it does immediate rss accounting (issue #1097)
err = unix_madvise(start, size, MADV_FREE_REUSABLE);
if (err) { err = unix_madvise(start, size, MADV_DONTNEED); }
#else
// decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
err = unix_madvise(start, size, MADV_DONTNEED);
#endif
#if !MI_DEBUG && MI_SECURE<=2
// decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
err = unix_madvise(start, size, MADV_DONTNEED);
#if !MI_DEBUG && !MI_SECURE
*needs_recommit = false;
#else
*needs_recommit = true;
@ -464,22 +438,14 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
}
int _mi_prim_reset(void* start, size_t size) {
int err = 0;
// on macOS can use MADV_FREE_REUSABLE (but we disable this for now as it seems slower)
#if 0 && defined(__APPLE__) && defined(MADV_FREE_REUSABLE)
err = unix_madvise(start, size, MADV_FREE_REUSABLE);
if (err==0) return 0;
// fall through
#endif
#if defined(MADV_FREE)
// Otherwise, we try to use `MADV_FREE` as that is the fastest. A drawback though is that it
// We try to use `MADV_FREE` as that is the fastest. A drawback though is that it
// will not reduce the `rss` stats in tools like `top` even though the memory is available
// to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by
// default `MADV_DONTNEED` is used though.
#if defined(MADV_FREE)
static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
int oadvice = (int)mi_atomic_load_relaxed(&advice);
int err;
while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0; };
if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
// if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
@ -487,7 +453,7 @@ int _mi_prim_reset(void* start, size_t size) {
err = unix_madvise(start, size, MADV_DONTNEED);
}
#else
err = unix_madvise(start, size, MADV_DONTNEED);
int err = unix_madvise(start, size, MADV_DONTNEED);
#endif
return err;
}
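The reset path above prefers MADV_FREE and falls back to MADV_DONTNEED when the kernel rejects it with EINVAL. A simplified standalone sketch of that fallback (it omits the EAGAIN retry loop and the atomically cached advice used above):
#include <errno.h>
#include <stddef.h>
#include <sys/mman.h>
static int reset_memory(void* start, size_t size) {
#if defined(MADV_FREE)
  if (madvise(start, size, MADV_FREE) == 0) return 0;
  if (errno != EINVAL) return errno;      // real failure: report it
#endif
  // MADV_FREE unsupported (or unavailable): decommit-style reset instead
  return (madvise(start, size, MADV_DONTNEED) == 0 ? 0 : errno);
}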


@ -149,11 +149,6 @@ int _mi_prim_reset(void* addr, size_t size) {
return 0;
}
int _mi_prim_reuse(void* addr, size_t size) {
MI_UNUSED(addr); MI_UNUSED(size);
return 0;
}
int _mi_prim_protect(void* addr, size_t size, bool protect) {
MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect);
return 0;


@ -12,10 +12,6 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc/prim.h"
#include <stdio.h> // fputs, stderr
// xbox has no console IO
#if !defined(WINAPI_FAMILY_PARTITION) || WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM)
#define MI_HAS_CONSOLE_IO
#endif
//---------------------------------------------
// Dynamically bind Windows API points for portability
@ -49,30 +45,22 @@ typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {
#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE 0x00000010
#include <winternl.h>
typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
typedef LONG (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); // avoid NTSTATUS as it is not defined on xbox (pr #1084)
typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
static PVirtualAlloc2 pVirtualAlloc2 = NULL;
static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7 (and GetNumaNodeProcessorMask is not supported on xbox)
// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7
typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber);
typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber);
typedef BOOL (__stdcall* PGetNumaNodeProcessorMask)(UCHAR Node, PULONGLONG ProcessorMask);
typedef BOOL (__stdcall* PGetNumaHighestNodeNumber)(PULONG Node);
static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL;
static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL;
static PGetNumaProcessorNode pGetNumaProcessorNode = NULL;
static PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = NULL;
static PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = NULL;
// Not available on xbox
typedef SIZE_T(__stdcall* PGetLargePageMinimum)(VOID);
static PGetLargePageMinimum pGetLargePageMinimum = NULL;
// Available after Windows XP
typedef BOOL (__stdcall *PGetPhysicallyInstalledSystemMemory)( PULONGLONG TotalMemoryInKilobytes );
@ -86,7 +74,6 @@ static bool win_enable_large_os_pages(size_t* large_page_size)
static bool large_initialized = false;
if (large_initialized) return (_mi_os_large_page_size() > 0);
large_initialized = true;
if (pGetLargePageMinimum==NULL) return false; // no large page support (xbox etc.)
// Try to see if large OS pages are supported
// To use large pages on Windows, we first need access permission
@ -105,8 +92,8 @@ static bool win_enable_large_os_pages(size_t* large_page_size)
if (ok) {
err = GetLastError();
ok = (err == ERROR_SUCCESS);
if (ok && large_page_size != NULL && pGetLargePageMinimum != NULL) {
*large_page_size = (*pGetLargePageMinimum)();
if (ok && large_page_size != NULL) {
*large_page_size = GetLargePageMinimum();
}
}
}
@ -162,9 +149,6 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode");
pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMask");
pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)(void (*)(void))GetProcAddress(hDll, "GetNumaHighestNodeNumber");
pGetLargePageMinimum = (PGetLargePageMinimum)(void (*)(void))GetProcAddress(hDll, "GetLargePageMinimum");
// Get physical memory (not available on XP, so check dynamically)
PGetPhysicallyInstalledSystemMemory pGetPhysicallyInstalledSystemMemory = (PGetPhysicallyInstalledSystemMemory)(void (*)(void))GetProcAddress(hDll,"GetPhysicallyInstalledSystemMemory");
if (pGetPhysicallyInstalledSystemMemory != NULL) {
@ -368,11 +352,6 @@ int _mi_prim_reset(void* addr, size_t size) {
return (p != NULL ? 0 : (int)GetLastError());
}
int _mi_prim_reuse(void* addr, size_t size) {
MI_UNUSED(addr); MI_UNUSED(size);
return 0;
}
int _mi_prim_protect(void* addr, size_t size, bool protect) {
DWORD oldprotect = 0;
BOOL ok = VirtualProtect(addr, size, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect);
@ -404,7 +383,7 @@ static void* _mi_prim_alloc_huge_os_pagesx(void* hint_addr, size_t size, int num
}
SIZE_T psize = size;
void* base = hint_addr;
LONG err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
if (err == 0 && base != NULL) {
return base;
}
@ -458,11 +437,9 @@ size_t _mi_prim_numa_node(void) {
size_t _mi_prim_numa_node_count(void) {
ULONG numa_max = 0;
if (pGetNumaHighestNodeNumber!=NULL) {
(*pGetNumaHighestNodeNumber)(&numa_max);
}
GetNumaHighestNodeNumber(&numa_max);
// find the highest node number that has actual processors assigned to it. Issue #282
while (numa_max > 0) {
while(numa_max > 0) {
if (pGetNumaNodeProcessorMaskEx != NULL) {
// Extended API is supported
GROUP_AFFINITY affinity;
@ -473,10 +450,8 @@ size_t _mi_prim_numa_node_count(void) {
else {
// Vista or earlier, use older API that is limited to 64 processors.
ULONGLONG mask;
if (pGetNumaNodeProcessorMask != NULL) {
if ((*pGetNumaNodeProcessorMask)((UCHAR)numa_max, &mask)) {
if (mask != 0) break; // found the maximum non-empty node
}
if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
if (mask != 0) break; // found the maximum non-empty node
};
}
// max node was invalid or had no processor assigned, try again
@ -566,21 +541,17 @@ void _mi_prim_out_stderr( const char* msg )
if (!_mi_preloading()) {
// _cputs(msg); // _cputs cannot be used as it aborts when failing to lock the console
static HANDLE hcon = INVALID_HANDLE_VALUE;
static bool hconIsConsole = false;
static bool hconIsConsole;
if (hcon == INVALID_HANDLE_VALUE) {
hcon = GetStdHandle(STD_ERROR_HANDLE);
#ifdef MI_HAS_CONSOLE_IO
CONSOLE_SCREEN_BUFFER_INFO sbi;
hcon = GetStdHandle(STD_ERROR_HANDLE);
hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi));
#endif
}
const size_t len = _mi_strlen(msg);
if (len > 0 && len < UINT32_MAX) {
DWORD written = 0;
if (hconIsConsole) {
#ifdef MI_HAS_CONSOLE_IO
WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL);
#endif
}
else if (hcon != INVALID_HANDLE_VALUE) {
// use direct write if stderr was redirected
@ -656,47 +627,19 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
// Process & Thread Init/Done
//----------------------------------------------------------------
#if MI_WIN_USE_FIXED_TLS==1
mi_decl_cache_align size_t _mi_win_tls_offset = 0;
#endif
//static void mi_debug_out(const char* s) {
// HANDLE h = GetStdHandle(STD_ERROR_HANDLE);
// WriteConsole(h, s, (DWORD)_mi_strlen(s), NULL, NULL);
//}
static void mi_win_tls_init(DWORD reason) {
if (reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) {
#if MI_WIN_USE_FIXED_TLS==1 // we must allocate a TLS slot dynamically
if (_mi_win_tls_offset == 0 && reason == DLL_PROCESS_ATTACH) {
const DWORD tls_slot = TlsAlloc(); // usually returns slot 1
if (tls_slot == TLS_OUT_OF_INDEXES) {
_mi_error_message(EFAULT, "unable to allocate a TLS slot (rebuild without MI_WIN_USE_FIXED_TLS?)\n");
}
_mi_win_tls_offset = (size_t)tls_slot * sizeof(void*);
}
#endif
#if MI_HAS_TLS_SLOT >= 2 // we must initialize the TLS slot before any allocation
if (mi_prim_get_default_heap() == NULL) {
_mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty);
#if MI_DEBUG && MI_WIN_USE_FIXED_TLS==1
void* const p = TlsGetValue((DWORD)(_mi_win_tls_offset / sizeof(void*)));
mi_assert_internal(p == (void*)&_mi_heap_empty);
#endif
}
#endif
}
}
static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
MI_UNUSED(reserved);
MI_UNUSED(module);
mi_win_tls_init(reason);
#if MI_TLS_SLOT >= 2
if ((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) {
_mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty);
}
#endif
if (reason==DLL_PROCESS_ATTACH) {
_mi_auto_process_init();
_mi_process_load();
}
else if (reason==DLL_PROCESS_DETACH) {
_mi_auto_process_done();
_mi_process_done();
}
else if (reason==DLL_THREAD_DETACH && !_mi_is_redirected()) {
_mi_thread_done(NULL);
@@ -786,7 +729,7 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
static int mi_process_attach(void) {
mi_win_main(NULL,DLL_PROCESS_ATTACH,NULL);
atexit(&_mi_auto_process_done);
atexit(&_mi_process_done);
return 0;
}
typedef int(*mi_crt_callback_t)(void);
@@ -853,7 +796,11 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
#endif
mi_decl_export void _mi_redirect_entry(DWORD reason) {
// called on redirection; careful as this may be called before DllMain
mi_win_tls_init(reason);
#if MI_TLS_SLOT >= 2
if ((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) {
_mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty);
}
#endif
if (reason == DLL_PROCESS_ATTACH) {
mi_redirected = true;
}

View file

@@ -143,17 +143,13 @@ void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* ctx_new) {
uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
mi_assert_internal(mi_random_is_initialized(ctx));
uintptr_t r;
do {
#if MI_INTPTR_SIZE <= 4
r = chacha_next32(ctx);
#elif MI_INTPTR_SIZE == 8
r = (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx));
#else
# error "define mi_random_next for this platform"
#endif
} while (r==0);
return r;
#if MI_INTPTR_SIZE <= 4
return chacha_next32(ctx);
#elif MI_INTPTR_SIZE == 8
return (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx));
#else
# error "define mi_random_next for this platform"
#endif
}
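
A minimal sketch of the retry-until-non-zero pattern kept on the `main` side of this hunk, with a hypothetical next32() standing in for chacha_next32():

#include <stdint.h>

extern uint32_t next32(void);  // stand-in for a 32-bit PRNG step such as chacha_next32

// Build a pointer-sized random value and never return zero
// (zero is typically reserved for "uninitialized" cookies/canaries).
static uintptr_t random_next_sketch(void) {
  uintptr_t r;
  do {
#if UINTPTR_MAX <= UINT32_MAX
    r = next32();
#else
    r = ((uintptr_t)next32() << 32) | next32();
#endif
  } while (r == 0);
  return r;
}
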
@@ -167,7 +163,7 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
x ^= _mi_prim_clock_now();
// and do a few randomization steps
uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
for (uintptr_t i = 0; i < max || x==0; i++, x++) {
for (uintptr_t i = 0; i < max; i++) {
x = _mi_random_shuffle(x);
}
mi_assert_internal(x != 0);
@@ -183,7 +179,7 @@ static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) {
if (!use_weak) { _mi_warning_message("unable to use secure randomness\n"); }
#endif
uintptr_t x = _mi_os_random_weak(0);
for (size_t i = 0; i < 8; i++, x++) { // key is eight 32-bit words.
for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words.
x = _mi_random_shuffle(x);
((uint32_t*)key)[i] = (uint32_t)x;
}

View file

@@ -61,7 +61,7 @@ static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bo
if mi_unlikely(part == NULL) {
if (!create_on_demand) return NULL;
mi_memid_t memid;
part = (mi_segmap_part_t*)_mi_os_zalloc(sizeof(mi_segmap_part_t), &memid);
part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid);
if (part == NULL) return NULL;
part->memid = memid;
mi_segmap_part_t* expected = NULL;

File diff suppressed because it is too large

View file

@@ -30,7 +30,6 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
{
// add atomically (for abandoned pages)
int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
// if (stat == &_mi_stats_main.committed) { mi_assert_internal(current + amount >= 0); };
mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
if (amount > 0) {
mi_atomic_addi64_relaxed(&stat->total,amount);
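
A minimal sketch of the add-then-raise-peak pattern used by mi_stat_update, written with C11 atomics instead of the mi_atomic_* wrappers (stat_sketch_t and stat_update_sketch are hypothetical names):

#include <stdatomic.h>
#include <stdint.h>

typedef struct { _Atomic int64_t current, peak, total; } stat_sketch_t;

static void stat_update_sketch(stat_sketch_t* stat, int64_t amount) {
  // add atomically and compute the new current value
  int64_t cur = atomic_fetch_add_explicit(&stat->current, amount, memory_order_relaxed) + amount;
  // raise the recorded peak if we exceeded it (a relaxed atomic max)
  int64_t peak = atomic_load_explicit(&stat->peak, memory_order_relaxed);
  while (cur > peak &&
         !atomic_compare_exchange_weak_explicit(&stat->peak, &peak, cur,
                                                memory_order_relaxed, memory_order_relaxed)) {
    // another thread raised the peak concurrently; `peak` was refreshed, retry
  }
  if (amount > 0) {
    atomic_fetch_add_explicit(&stat->total, amount, memory_order_relaxed);
  }
}
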
@@ -62,25 +61,6 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
}
static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) {
if (amount == 0) return;
if mi_unlikely(mi_is_in_main(stat))
{
// adjust atomically
mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_addi64_relaxed(&stat->total,amount);
}
else {
// adjust local
stat->current += amount;
stat->total += amount;
}
}
void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) {
mi_stat_adjust(stat, -((int64_t)amount));
}
// must be thread safe as it is called from stats_merge
static void mi_stat_count_add_mt(mi_stat_count_t* stat, const mi_stat_count_t* src) {
@@ -114,8 +94,8 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
}
#endif
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]);
}
mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]);
}
}
#undef MI_STAT_COUNT
@@ -218,15 +198,6 @@ static void mi_stat_peak_print(const mi_stat_count_t* stat, const char* msg, int
_mi_fprintf(out, arg, "\n");
}
#if MI_STAT>1
static void mi_stat_total_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) {
_mi_fprintf(out, arg, "%10s:", msg);
_mi_fprintf(out, arg, "%12s", " "); // no peak
mi_print_amount(stat->total, unit, out, arg);
_mi_fprintf(out, arg, "\n");
}
#endif
static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) {
_mi_fprintf(out, arg, "%10s:", msg);
mi_print_amount(stat->total, -1, out, arg);
@@ -243,7 +214,7 @@ static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char*
static void mi_print_header(mi_output_fun* out, void* arg ) {
_mi_fprintf(out, arg, "%10s: %11s %11s %11s %11s %11s\n", "heap stats", "peak ", "total ", "current ", "block ", "total# ");
_mi_fprintf(out, arg, "%10s: %11s %11s %11s %11s %11s\n", "heap stats", "peak ", "total ", "current ", "unit ", "total# ");
}
#if MI_STAT>1
@@ -312,20 +283,18 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
// and print using that
mi_print_header(out,arg);
#if MI_STAT>1
mi_stats_print_bins(stats->malloc_bins, MI_BIN_HUGE, "bin",out,arg);
mi_stats_print_bins(stats->malloc_bins, MI_BIN_HUGE, "normal",out,arg);
#endif
#if MI_STAT
mi_stat_print(&stats->malloc_normal, "binned", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg);
// mi_stat_print(&stats->malloc_large, "large", (stats->malloc_large_count.total == 0 ? 1 : -1), out, arg);
mi_stat_print(&stats->malloc_normal, "normal", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg);
mi_stat_print(&stats->malloc_huge, "huge", (stats->malloc_huge_count.total == 0 ? 1 : -1), out, arg);
mi_stat_count_t total = { 0,0,0 };
mi_stat_count_add_mt(&total, &stats->malloc_normal);
// mi_stat_count_add(&total, &stats->malloc_large);
mi_stat_count_add_mt(&total, &stats->malloc_huge);
mi_stat_print_ex(&total, "total", 1, out, arg, "");
#endif
#if MI_STAT>1
mi_stat_total_print(&stats->malloc_requested, "malloc req", 1, out, arg);
mi_stat_print_ex(&stats->malloc_requested, "malloc req", 1, out, arg, "");
_mi_fprintf(out, arg, "\n");
#endif
mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, "");
@@ -350,7 +319,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
mi_stat_counter_print(&stats->malloc_guarded_count, "guarded", out, arg);
mi_stat_print(&stats->threads, "threads", -1, out, arg);
mi_stat_counter_print_avg(&stats->page_searches, "searches", out, arg);
_mi_fprintf(out, arg, "%10s: %5i\n", "numa nodes", _mi_os_numa_node_count());
_mi_fprintf(out, arg, "%10s: %5zu\n", "numa nodes", _mi_os_numa_node_count());
size_t elapsed;
size_t user_time;
@@ -361,9 +330,9 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
size_t peak_commit;
size_t page_faults;
mi_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
_mi_fprintf(out, arg, "%10s: %5zu.%03zu s\n", "elapsed", elapsed/1000, elapsed%1000);
_mi_fprintf(out, arg, "%10s: user: %zu.%03zu s, system: %zu.%03zu s, faults: %zu, rss: ", "process",
user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, page_faults );
_mi_fprintf(out, arg, "%10s: %5ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
_mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process",
user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults );
mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s");
if (peak_commit > 0) {
_mi_fprintf(out, arg, ", commit: ");
@@ -397,10 +366,6 @@ void mi_stats_merge(void) mi_attr_noexcept {
mi_stats_merge_from( mi_stats_get_default() );
}
void _mi_stats_merge_thread(mi_tld_t* tld) {
mi_stats_merge_from( &tld->stats );
}
void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done`
mi_stats_merge_from(stats);
}
@@ -504,7 +469,7 @@ static bool mi_heap_buf_expand(mi_heap_buf_t* hbuf) {
hbuf->buf[hbuf->size-1] = 0;
}
if (hbuf->size > SIZE_MAX/2 || !hbuf->can_realloc) return false;
const size_t newsize = (hbuf->size == 0 ? mi_good_size(12*MI_KiB) : 2*hbuf->size);
const size_t newsize = (hbuf->size == 0 ? 2*MI_KiB : 2*hbuf->size);
char* const newbuf = (char*)mi_rezalloc(hbuf->buf, newsize);
if (newbuf == NULL) return false;
hbuf->buf = newbuf;
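
A minimal sketch of the doubling growth used here, with plain realloc standing in for mi_rezalloc and a caller-chosen start_size where the two sides of the diff differ:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

// Grow *buf geometrically; returns 1 on success, 0 if it would overflow or allocation fails.
static int buf_expand_sketch(char** buf, size_t* size, size_t start_size) {
  if (*size > SIZE_MAX/2) return 0;                     // doubling would overflow
  const size_t newsize = (*size == 0 ? start_size : 2*(*size));
  char* newbuf = (char*)realloc(*buf, newsize);
  if (newbuf == NULL) return 0;
  memset(newbuf + *size, 0, newsize - *size);           // zero the newly added tail (rezalloc-like)
  *buf = newbuf;
  *size = newsize;
  return 1;
}
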
@@ -531,12 +496,7 @@ static void mi_heap_buf_print_count_bin(mi_heap_buf_t* hbuf, const char* prefix,
const size_t binsize = _mi_bin_size(bin);
const size_t pagesize = (binsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_SMALL_PAGE_SIZE :
(binsize <= MI_MEDIUM_OBJ_SIZE_MAX ? MI_MEDIUM_PAGE_SIZE :
#if MI_LARGE_PAGE_SIZE
(binsize <= MI_LARGE_OBJ_SIZE_MAX ? MI_LARGE_PAGE_SIZE : 0)
#else
0
#endif
));
(binsize <= MI_LARGE_OBJ_SIZE_MAX ? MI_LARGE_PAGE_SIZE : 0)));
char buf[128];
_mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld, \"block_size\": %zu, \"page_size\": %zu }%s\n", prefix, stat->total, stat->peak, stat->current, binsize, pagesize, (add_comma ? "," : ""));
buf[127] = 0;
@@ -629,7 +589,7 @@ char* mi_stats_get_json(size_t output_size, char* output_buf) mi_attr_noexcept {
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
mi_heap_buf_print_count_bin(&hbuf, " ", &stats->page_bins[i], i, i!=MI_BIN_HUGE);
}
mi_heap_buf_print(&hbuf, " ]\n");
mi_heap_buf_print(&hbuf, " ]\n");
mi_heap_buf_print(&hbuf, "}\n");
return hbuf.buf;
}

View file

@@ -16,7 +16,7 @@ if (NOT CMAKE_BUILD_TYPE)
endif()
# Import mimalloc (if installed)
find_package(mimalloc 2.2 CONFIG REQUIRED)
find_package(mimalloc 1.9 CONFIG REQUIRED)
message(STATUS "Found mimalloc installed at: ${MIMALLOC_LIBRARY_DIR} (${MIMALLOC_VERSION_DIR})")

View file

@@ -1,7 +1,6 @@
// Issue #981: test overriding allocation in a DLL that is compiled independent of mimalloc.
// This is imported by the `mimalloc-test-override` project.
#include <string>
#include <iostream>
#include "main-override-dep.h"
std::string TestAllocInDll::GetString()
@@ -11,41 +10,6 @@ std::string TestAllocInDll::GetString()
const char* t = "test";
memcpy(test, t, 4);
std::string r = test;
std::cout << "override-dep: GetString: " << r << "\n";
delete[] test;
return r;
}
class Static {
private:
void* p;
public:
Static() {
printf("override-dep: static constructor\n");
p = malloc(64);
return;
}
~Static() {
free(p);
printf("override-dep: static destructor\n");
return;
}
};
static Static s = Static();
#include <windows.h>
BOOL WINAPI DllMain(HINSTANCE module, DWORD reason, LPVOID reserved) {
(void)(reserved);
(void)(module);
if (reason==DLL_PROCESS_ATTACH) {
printf("override-dep: dll attach\n");
}
else if (reason==DLL_PROCESS_DETACH) {
printf("override-dep: dll detach\n");
}
return TRUE;
}
}

View file

@@ -10,6 +10,7 @@
#include <mimalloc.h>
#include <mimalloc-override.h> // redefines malloc etc.
static void mi_bins(void);
static void double_free1();
static void double_free2();
@@ -23,12 +24,11 @@ static void test_reserved(void);
static void negative_stat(void);
static void alloc_huge(void);
static void test_heap_walk(void);
static void test_heap_arena(void);
static void test_align(void);
static void test_canary_leak(void);
static void test_manage_os_memory(void);
// static void test_large_pages(void);
int main() {
mi_version();
mi_stats_reset();
@@ -43,17 +43,15 @@ int main() {
// corrupt_free();
// block_overflow1();
// block_overflow2();
test_canary_leak();
// test_canary_leak();
// test_aslr();
// invalid_free();
// test_reserved();
// negative_stat();
// test_heap_walk();
// alloc_huge();
// test_heap_walk();
// test_heap_arena();
// test_align();
void* p1 = malloc(78);
void* p2 = malloc(24);
free(p1);
@@ -69,7 +67,7 @@ int main() {
free(p1);
free(p2);
free(s);
/* now test if the override worked by allocating/freeing across the APIs */
//p1 = mi_malloc(32);
//free(p1);
@@ -84,13 +82,6 @@ int main() {
return 0;
}
static void test_align() {
void* p = mi_malloc_aligned(256, 256);
if (((uintptr_t)p % 256) != 0) {
fprintf(stderr, "%p is not 256-byte aligned!\n", p);
}
}
static void invalid_free() {
free((void*)0xBADBEEF);
realloc((void*)0xBADBEEF,10);
@@ -248,20 +239,6 @@ static void test_heap_walk(void) {
mi_heap_visit_blocks(heap, true, &test_visit, NULL);
}
static void test_heap_arena(void) {
mi_arena_id_t arena_id;
int err = mi_reserve_os_memory_ex(100 * 1024 * 1024, false /* commit */, false /* allow large */, true /* exclusive */, &arena_id);
if (err) abort();
mi_heap_t* heap = mi_heap_new_in_arena(arena_id);
for (int i = 0; i < 500000; i++) {
void* p = mi_heap_malloc(heap, 1024);
if (p == NULL) {
printf("out of memory after %d kb (expecting about 100_000kb)\n", i);
break;
}
}
}
static void test_canary_leak(void) {
char* p = mi_mallocn_tp(char,23);
for(int i = 0; i < 23; i++) {

View file

@@ -27,12 +27,9 @@ static void heap_late_free(); // issue #204
static void padding_shrink(); // issue #209
static void various_tests();
static void test_mt_shutdown();
static void large_alloc(void); // issue #363
static void fail_aslr(); // issue #372
static void tsan_numa_test(); // issue #414
static void strdup_test(); // issue #445
static void bench_alloc_large(void); // issue #xxx
//static void test_large_migrate(void); // issue #691
static void heap_thread_free_huge();
static void test_std_string(); // issue #697
static void test_thread_local(); // issue #944
@@ -40,7 +37,7 @@ static void test_thread_local(); // issue #944
static void test_mixed1(); // issue #942
static void test_stl_allocators();
#if _WIN32
#if x_WIN32
#include "main-override-dep.h"
static void test_dep(); // issue #981: test overriding in another DLL
#else
@@ -58,20 +55,18 @@ int main() {
//test_thread_local();
// heap_thread_free_huge();
/*
heap_thread_free_huge();
heap_thread_free_large();
heap_no_delete();
heap_late_free();
padding_shrink();
various_tests();
large_alloc();
tsan_numa_test();
strdup_test();
*/
// test_stl_allocators();
// test_mt_shutdown();
// test_large_migrate();
heap_thread_free_large();
heap_no_delete();
heap_late_free();
padding_shrink();
tsan_numa_test();
*/
/*
strdup_test();
test_stl_allocators();
test_mt_shutdown();
*/
//fail_aslr();
mi_stats_print(NULL);
return 0;
@@ -150,12 +145,11 @@ static bool test_stl_allocator1() {
struct some_struct { int i; int j; double z; };
#if _WIN32
#if x_WIN32
static void test_dep()
{
TestAllocInDll t;
std::string s = t.GetString();
std::cout << "test_dep GetString: " << s << "\n";
}
#endif
@@ -364,7 +358,7 @@ static void heap_thread_free_large_worker() {
static void heap_thread_free_large() {
for (int i = 0; i < 100; i++) {
shared_p = mi_malloc_aligned(2 * 1024 * 1024 + 1, 8);
shared_p = mi_malloc_aligned(2*1024*1024 + 1, 8);
auto t1 = std::thread(heap_thread_free_large_worker);
t1.join();
}
@@ -375,13 +369,14 @@ static void heap_thread_free_huge_worker() {
}
static void heap_thread_free_huge() {
for (int i = 0; i < 100; i++) {
for (int i = 0; i < 10; i++) {
shared_p = mi_malloc(1024 * 1024 * 1024);
auto t1 = std::thread(heap_thread_free_huge_worker);
t1.join();
}
}
static void test_mt_shutdown()
{
const int threads = 5;
@@ -406,18 +401,6 @@ static void test_mt_shutdown()
std::cout << "done" << std::endl;
}
// issue #363
using namespace std;
void large_alloc(void)
{
char* a = new char[1ull << 25];
thread th([&] {
delete[] a;
});
th.join();
}
// issue #372
static void fail_aslr() {
size_t sz = (size_t)(4ULL << 40); // 4TiB
@@ -438,36 +421,6 @@ static void tsan_numa_test() {
t1.join();
}
// issue #?
#include <chrono>
#include <random>
#include <iostream>
static void bench_alloc_large(void) {
static constexpr int kNumBuffers = 20;
static constexpr size_t kMinBufferSize = 5 * 1024 * 1024;
static constexpr size_t kMaxBufferSize = 25 * 1024 * 1024;
std::unique_ptr<char[]> buffers[kNumBuffers];
std::random_device rd; (void)rd;
std::mt19937 gen(42); //rd());
std::uniform_int_distribution<> size_distribution(kMinBufferSize, kMaxBufferSize);
std::uniform_int_distribution<> buf_number_distribution(0, kNumBuffers - 1);
static constexpr int kNumIterations = 2000;
const auto start = std::chrono::steady_clock::now();
for (int i = 0; i < kNumIterations; ++i) {
int buffer_idx = buf_number_distribution(gen);
size_t new_size = size_distribution(gen);
buffers[buffer_idx] = std::make_unique<char[]>(new_size);
}
const auto end = std::chrono::steady_clock::now();
const auto num_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
const auto us_per_allocation = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / kNumIterations;
std::cout << kNumIterations << " allocations done in " << num_ms << "ms." << std::endl;
std::cout << "Avg " << us_per_allocation << " us per allocation" << std::endl;
}
class MTest
{
@@ -494,4 +447,4 @@ void test_thread_local()
mi_stats_print(NULL);
}
return;
}
}

View file

@@ -203,11 +203,7 @@ int main(void) {
CHECK_BODY("malloc-aligned9") { // test large alignments
bool ok = true;
void* p[8];
size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1,
#if SIZE_MAX > UINT32_MAX
2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX,
#endif
0 };
size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 };
for (int i = 0; i < 28 && ok; i++) {
int align = (1 << i);
for (int j = 0; j < 8 && ok; j++) {

View file

@@ -320,17 +320,11 @@ int main(int argc, char** argv) {
// Run ITER full iterations where half the objects in the transfer buffer survive to the next round.
srand(0x7feb352d);
//mi_reserve_os_memory(512ULL << 20, true, true);
#if !defined(NDEBUG) && !defined(USE_STD_MALLOC)
mi_stats_reset();
#endif
// mi_stats_reset();
#ifdef STRESS
test_stress();
test_stress();
#else
test_leak();
test_leak();
#endif
#ifndef USE_STD_MALLOC
@@ -343,7 +337,6 @@ int main(int argc, char** argv) {
mi_free(json);
}
#endif
mi_collect(true);
mi_stats_print(NULL);
#endif
//bench_end_program();