diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ce084f6..5cc7ec5c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,7 +126,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENE
   set(MI_ARCH "x64")
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv[89].?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
   set(MI_ARCH "arm64")
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567].?|ARM)$")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567].?|ARM)$")  # keep the optional one-char suffix so armv7l/armv6l/armv7a are still detected as 32-bit arm
   set(MI_ARCH "arm32")
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$")
   if(CMAKE_SIZEOF_VOID_P==4)
@@ -173,8 +173,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "Intel")
     list(APPEND mi_cflags -Wall)
 endif()
 
-# force C++ compilation with msvc or clang-cl to use modern C++ atomics
-if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel" OR MI_CLANG_CL)
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel")
   set(MI_USE_CXX "ON")
 endif()
 
@@ -713,12 +712,10 @@ if (MI_BUILD_TESTS)
     target_compile_definitions(mimalloc-test-${TEST_NAME} PRIVATE ${mi_defines})
     target_compile_options(mimalloc-test-${TEST_NAME} PRIVATE ${mi_cflags})
     target_include_directories(mimalloc-test-${TEST_NAME} PRIVATE include)
-    if(MI_BUILD_STATIC AND NOT MI_DEBUG_TSAN)
-      target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc-static ${mi_libraries})
-    elseif(MI_BUILD_SHARED)
+    if(MI_BUILD_SHARED AND (NOT MI_BUILD_STATIC OR MI_TRACK_ASAN OR MI_DEBUG_TSAN OR MI_DEBUG_UBSAN))  # use the shared library under sanitizers, or when no static library is built
       target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc ${mi_libraries})
     else()
-      message(STATUS "cannot build TSAN tests without MI_BUILD_SHARED being enabled")
+      target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc-static ${mi_libraries})
     endif()
     add_test(NAME test-${TEST_NAME} COMMAND mimalloc-test-${TEST_NAME})
   endforeach()
@@ -727,19 +724,21 @@ if (MI_BUILD_TESTS)
   if(MI_BUILD_SHARED AND NOT (MI_TRACK_ASAN OR MI_DEBUG_TSAN OR MI_DEBUG_UBSAN))
     add_executable(mimalloc-test-stress-dynamic test/test-stress.c)
     target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE ${mi_defines} "USE_STD_MALLOC=1")
+    if(WIN32)
+      target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE "MI_LINK_VERSION=1")  # link mi_version
+    endif()
     target_compile_options(mimalloc-test-stress-dynamic PRIVATE ${mi_cflags})
     target_include_directories(mimalloc-test-stress-dynamic PRIVATE include)
+    target_link_libraries(mimalloc-test-stress-dynamic PRIVATE mimalloc ${mi_libraries})  # link the shared mimalloc library on every platform (for mi_version)
     if(WIN32)
-      target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE "MI_LINK_VERSION=1")  # link mi_version
-      target_link_libraries(mimalloc-test-stress-dynamic PRIVATE mimalloc ${mi_libraries})  # link mi_version
-      add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_VERBOSE=1 $<TARGET_FILE:mimalloc-test-stress-dynamic>)
+      add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_SHOW_STATS=1 $<TARGET_FILE:mimalloc-test-stress-dynamic>)
     else()
       if(APPLE)
         set(LD_PRELOAD "DYLD_INSERT_LIBRARIES")
       else()
         set(LD_PRELOAD "LD_PRELOAD")
       endif()
-      add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_VERBOSE=1 ${LD_PRELOAD}=$<TARGET_FILE:mimalloc> $<TARGET_FILE:mimalloc-test-stress-dynamic>)
+      add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_SHOW_STATS=1 ${LD_PRELOAD}=$<TARGET_FILE:mimalloc> $<TARGET_FILE:mimalloc-test-stress-dynamic>)
     endif()
   endif()
 endif()
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 83d6a482..d761d8c8 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -40,14 +40,6 @@ jobs:
         BuildType: release
         cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -A Win32
         MSBuildConfiguration: Release
-      Debug Fixed TLS:
-        BuildType: debug
-        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_WIN_USE_FIXED_TLS=ON
-        MSBuildConfiguration: Debug
-      Release Fixed TLS:
-        BuildType: release
-        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_WIN_USE_FIXED_TLS=ON
-        MSBuildConfiguration: Release
   steps:
   - task: CMake@1
     inputs:
@@ -183,6 +175,35 @@ jobs:
 # Other OS versions (just debug mode)
 # ----------------------------------------------------------
 
+- job:
+  displayName: Windows 2019
+  pool:
+    vmImage:
+      windows-2019
+  strategy:
+    matrix:
+      Debug:
+        BuildType: debug
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
+        MSBuildConfiguration: Debug
+      Release:
+        BuildType: release
+        cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
+        MSBuildConfiguration: Release
+  steps:
+  - task: CMake@1
+    inputs:
+      workingDirectory: $(BuildType)
+      cmakeArgs: .. $(cmakeExtraArgs)
+  - task: MSBuild@1
+    inputs:
+      solution: $(BuildType)/libmimalloc.sln
+      configuration: '$(MSBuildConfiguration)'
+      msbuildArguments: -m
+  - script: ctest --verbose --timeout 240 -C $(MSBuildConfiguration)
+    workingDirectory: $(BuildType)
+    displayName: CTest
+
 - job:
   displayName: Ubuntu 24.04
   pool:
diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake
index dfe78468..daac7a5d 100644
--- a/cmake/mimalloc-config-version.cmake
+++ b/cmake/mimalloc-config-version.cmake
@@ -1,6 +1,6 @@
 set(mi_version_major 2)
 set(mi_version_minor 2)
-set(mi_version_patch 5)
+set(mi_version_patch 3)
 set(mi_version ${mi_version_major}.${mi_version_minor})
 
 set(PACKAGE_VERSION ${mi_version})
diff --git a/contrib/docker/alpine-arm32v7/Dockerfile b/contrib/docker/alpine-arm32v7/Dockerfile
index daa60f50..f74934fb 100644
--- a/contrib/docker/alpine-arm32v7/Dockerfile
+++ b/contrib/docker/alpine-arm32v7/Dockerfile
@@ -1,6 +1,6 @@
 # install from an image
 # download first an appropriate tar.gz image into the current directory
-# from <https://github.com/alpinelinux/docker-alpine/tree/edge/armv7>
+# from: <https://github.com/alpinelinux/docker-alpine/tree/edge/armv7>
 FROM scratch
 
 # Substitute the image name that was downloaded
diff --git a/contrib/docker/alpine-x86/Dockerfile b/contrib/docker/alpine-x86/Dockerfile
deleted file mode 100644
index a0f76c17..00000000
--- a/contrib/docker/alpine-x86/Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-# install from an image
-# download first an appropriate tar.gz image into the current directory
-# from <https://github.com/alpinelinux/docker-alpine/tree/edge/x86>
-FROM scratch
-
-# Substitute the image name that was downloaded
-ADD alpine-minirootfs-20250108-x86.tar.gz /
-
-# Install tools
-RUN apk add build-base make cmake
-RUN apk add git
-RUN apk add vim
-
-RUN mkdir -p  /home/dev
-WORKDIR /home/dev
-
-# Get mimalloc
-RUN git clone https://github.com/microsoft/mimalloc -b dev2
-RUN mkdir -p mimalloc/out/release
-RUN mkdir -p mimalloc/out/debug
-
-# Build mimalloc debug
-WORKDIR /home/dev/mimalloc/out/debug
-RUN cmake ../.. -DMI_DEBUG_FULL=ON
-# RUN make -j
-# RUN make test
-
-CMD ["/bin/sh"]
diff --git a/contrib/vcpkg/portfile.cmake b/contrib/vcpkg/portfile.cmake
index abb90af9..2e40c70d 100644
--- a/contrib/vcpkg/portfile.cmake
+++ b/contrib/vcpkg/portfile.cmake
@@ -4,12 +4,12 @@ vcpkg_from_github(
   HEAD_REF master
 
   # The "REF" can be a commit hash, branch name (dev2), or a version (v2.2.1).
-  REF "v${VERSION}"
-  # REF e2db21e9ba9fb9172b7b0aa0fe9b8742525e8774
+  # REF "v${VERSION}"
+  REF e2db21e9ba9fb9172b7b0aa0fe9b8742525e8774
 
   # The sha512 is the hash of the tar.gz bundle.
-  # (To get the sha512, run `vcpkg install "mimalloc[override]" --overlay-ports=./contrib/vcpkg` and copy the sha from the error message.)
-  SHA512 5218fcd3ad285687ed3f78b4651d7d3aee92b6f28e6c563a884975e654a43c94c4e5c02c5ed0322c3d3627d83d4843df2d2d8441f09aa18d00674ca9fd657345
+  # (To get the sha512, run `vcpkg install mimalloc[override] --overlay-ports=<dir of this file>` and copy the sha from the error message.)
+  SHA512 8cbb601fdf8b46dd6a9c0d314d6da9d4960699853829e96d2470753867f90689fb4caeaf30d628943fd388670dc11902dbecc9cc7c329b99a510524a09bdb612
 )
 
 vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS
@@ -19,7 +19,6 @@ vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS
     secure      MI_SECURE
     override    MI_OVERRIDE
     optarch     MI_OPT_ARCH
-    nooptarch   MI_NO_OPT_ARCH
     optsimd     MI_OPT_SIMD
     xmalloc     MI_XMALLOC
     asm         MI_SEE_ASM
diff --git a/contrib/vcpkg/vcpkg.json b/contrib/vcpkg/vcpkg.json
index 42f2aa35..ce0c3456 100644
--- a/contrib/vcpkg/vcpkg.json
+++ b/contrib/vcpkg/vcpkg.json
@@ -1,6 +1,6 @@
 {
   "name": "mimalloc",
-  "version": "2.2.4",
+  "version": "2.2.2",
   "port-version": 1,
   "description": "Compact general purpose allocator with excellent performance",
   "homepage": "https://github.com/microsoft/mimalloc",
@@ -35,9 +35,6 @@
     "optarch": {
       "description": "Use architecture specific optimizations (on x64: '-march=haswell;-mavx2', on arm64: '-march=armv8.1-a')"
     },
-    "nooptarch": {
-      "description": "Do _not_ use architecture specific optimizations (on x64: '-march=haswell;-mavx2', on arm64: '-march=armv8.1-a')"
-    },
     "optsimd": {
       "description": "Allow use of SIMD instructions (avx2 or neon) (requires 'optarch' to be enabled)"
     },
diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj
index 128a4ff6..d6af71ce 100644
--- a/ide/vs2022/mimalloc-test-stress.vcxproj
+++ b/ide/vs2022/mimalloc-test-stress.vcxproj
@@ -282,8 +282,8 @@
     </ClCompile>
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="mimalloc-override-dll.vcxproj">
-      <Project>{abb5eae7-b3e6-432e-b636-333449892ea7}</Project>
+    <ProjectReference Include="mimalloc-lib.vcxproj">
+      <Project>{abb5eae7-b3e6-432e-b636-333449892ea6}</Project>
     </ProjectReference>
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/include/mimalloc.h b/include/mimalloc.h
index d895d925..ff6f0568 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #ifndef MIMALLOC_H
 #define MIMALLOC_H
 
-#define MI_MALLOC_VERSION 225  // major + 2 digits minor
+#define MI_MALLOC_VERSION 223   // major + 2 digits minor
 
 // ------------------------------------------------------
 // Compiler specific attributes
@@ -154,21 +154,17 @@ mi_decl_export void mi_stats_reset(void)      mi_attr_noexcept;
 mi_decl_export void mi_stats_merge(void)      mi_attr_noexcept;
 mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept;  // backward compatibility: `out` is ignored and should be NULL
 mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
-mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
 mi_decl_export void mi_options_print(void)    mi_attr_noexcept;
 
+mi_decl_export void mi_process_init(void)     mi_attr_noexcept;
+mi_decl_export void mi_thread_init(void)      mi_attr_noexcept;
+mi_decl_export void mi_thread_done(void)      mi_attr_noexcept;
+mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
 mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs,
                                     size_t* current_rss, size_t* peak_rss,
                                     size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept;
 
-
-// Generally do not use the following as these are usually called automatically
-mi_decl_export void mi_process_init(void)     mi_attr_noexcept;
-mi_decl_export void mi_cdecl mi_process_done(void) mi_attr_noexcept;
-mi_decl_export void mi_thread_init(void)      mi_attr_noexcept;
-mi_decl_export void mi_thread_done(void)      mi_attr_noexcept;
-
-
 // -------------------------------------------------------------------------------------
 // Aligned allocation
 // Note that `alignment` always follows `size` for consistency with unaligned
diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h
index e8bac316..39ff5c90 100644
--- a/include/mimalloc/atomic.h
+++ b/include/mimalloc/atomic.h
@@ -111,7 +111,6 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
 #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,(tp*)des)
 #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
 #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,(tp*)des)
-#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des)  mi_atomic_cas_strong_acq_rel(p,exp,(tp*)des)
 #define mi_atomic_exchange_ptr_relaxed(tp,p,x)          mi_atomic_exchange_relaxed(p,(tp*)x)
 #define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,(tp*)x)
 #define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,(tp*)x)
@@ -121,7 +120,6 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
 #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,des)
 #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,des)
 #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,des)
-#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des)  mi_atomic_cas_strong_acq_rel(p,exp,des)
 #define mi_atomic_exchange_ptr_relaxed(tp,p,x)          mi_atomic_exchange_relaxed(p,x)
 #define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,x)
 #define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,x)
@@ -305,7 +303,6 @@ static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p,
 #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
 #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
 #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
-#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des)  mi_atomic_cas_strong_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
 #define mi_atomic_exchange_ptr_relaxed(tp,p,x)          (tp*)mi_atomic_exchange_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
 #define mi_atomic_exchange_ptr_release(tp,p,x)          (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
 #define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index ca5be930..eae85ab6 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -8,6 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #ifndef MIMALLOC_INTERNAL_H
 #define MIMALLOC_INTERNAL_H
 
+
 // --------------------------------------------------------------------------
 // This file contains the internal API's of mimalloc and various utility
 // functions and macros.
@@ -16,88 +17,50 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "types.h"
 #include "track.h"
 
-
-// --------------------------------------------------------------------------
-// Compiler defines
-// --------------------------------------------------------------------------
-
 #if (MI_DEBUG>0)
 #define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
 #else
 #define mi_trace_message(...)
 #endif
 
-#define mi_decl_cache_align     mi_decl_align(64)
-
+#define MI_CACHE_LINE          64
 #if defined(_MSC_VER)
 #pragma warning(disable:4127)   // suppress constant conditional warning (due to MI_SECURE paths)
 #pragma warning(disable:26812)  // unscoped enum warning
 #define mi_decl_noinline        __declspec(noinline)
 #define mi_decl_thread          __declspec(thread)
-#define mi_decl_align(a)        __declspec(align(a))
-#define mi_decl_noreturn        __declspec(noreturn)
+#define mi_decl_cache_align     __declspec(align(MI_CACHE_LINE))
 #define mi_decl_weak
 #define mi_decl_hidden
-#define mi_decl_cold
 #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
 #define mi_decl_noinline        __attribute__((noinline))
 #define mi_decl_thread          __thread
-#define mi_decl_align(a)        __attribute__((aligned(a)))
-#define mi_decl_noreturn        __attribute__((noreturn))
+#define mi_decl_cache_align     __attribute__((aligned(MI_CACHE_LINE)))
 #define mi_decl_weak            __attribute__((weak))
 #define mi_decl_hidden          __attribute__((visibility("hidden")))
-#if (__GNUC__ >= 4) || defined(__clang__)
-#define mi_decl_cold            __attribute__((cold))
-#else
-#define mi_decl_cold
-#endif
 #elif __cplusplus >= 201103L    // c++11
 #define mi_decl_noinline
 #define mi_decl_thread          thread_local
-#define mi_decl_align(a)        alignas(a)
-#define mi_decl_noreturn        [[noreturn]]
+#define mi_decl_cache_align     alignas(MI_CACHE_LINE)
 #define mi_decl_weak
 #define mi_decl_hidden
-#define mi_decl_cold
 #else
 #define mi_decl_noinline
 #define mi_decl_thread          __thread        // hope for the best :-)
-#define mi_decl_align(a)
-#define mi_decl_noreturn
+#define mi_decl_cache_align
 #define mi_decl_weak
 #define mi_decl_hidden
-#define mi_decl_cold
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#define mi_unlikely(x)     (__builtin_expect(!!(x),false))
-#define mi_likely(x)       (__builtin_expect(!!(x),true))
-#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
-#define mi_unlikely(x)     (x) [[unlikely]]
-#define mi_likely(x)       (x) [[likely]]
-#else
-#define mi_unlikely(x)     (x)
-#define mi_likely(x)       (x)
-#endif
-
-#ifndef __has_builtin
-#define __has_builtin(x)    0
-#endif
-
-#if defined(__cplusplus)
-#define mi_decl_externc     extern "C"
-#else
-#define mi_decl_externc
 #endif
 
 #if defined(__EMSCRIPTEN__) && !defined(__wasi__)
 #define __wasi__
 #endif
 
-
-// --------------------------------------------------------------------------
-// Internal functions
-// --------------------------------------------------------------------------
+#if defined(__cplusplus)
+#define mi_decl_externc       extern "C"
+#else
+#define mi_decl_externc
+#endif
 
 // "libc.c"
 #include    <stdarg.h>
@@ -133,10 +96,10 @@ uintptr_t   _mi_os_random_weak(uintptr_t extra_seed);
 static inline uintptr_t _mi_random_shuffle(uintptr_t x);
 
 // init.c
-extern mi_decl_hidden mi_decl_cache_align mi_stats_t       _mi_stats_main;
+extern mi_decl_cache_align mi_stats_t       _mi_stats_main;
 extern mi_decl_hidden mi_decl_cache_align const mi_page_t  _mi_page_empty;
-void        _mi_auto_process_init(void);
-void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept;
+void        _mi_process_load(void);
+void mi_cdecl _mi_process_done(void);
 bool        _mi_is_redirected(void);
 bool        _mi_allocator_init(const char** message);
 void        _mi_allocator_done(void);
@@ -154,7 +117,6 @@ void        _mi_heap_guarded_init(mi_heap_t* heap);
 // os.c
 void        _mi_os_init(void);                                            // called from process init
 void*       _mi_os_alloc(size_t size, mi_memid_t* memid);
-void*       _mi_os_zalloc(size_t size, mi_memid_t* memid);
 void        _mi_os_free(void* p, size_t size, mi_memid_t memid);
 void        _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid);
 
@@ -164,14 +126,13 @@ bool        _mi_os_has_overcommit(void);
 bool        _mi_os_has_virtual_reserve(void);
 
 bool        _mi_os_reset(void* addr, size_t size);
+bool        _mi_os_commit(void* p, size_t size, bool* is_zero);
+bool        _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size);
 bool        _mi_os_decommit(void* addr, size_t size);
+bool        _mi_os_protect(void* addr, size_t size);
 bool        _mi_os_unprotect(void* addr, size_t size);
 bool        _mi_os_purge(void* p, size_t size);
 bool        _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size);
-void        _mi_os_reuse(void* p, size_t size);
-mi_decl_nodiscard bool _mi_os_commit(void* p, size_t size, bool* is_zero);
-mi_decl_nodiscard bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size);
-bool        _mi_os_protect(void* addr, size_t size);
 
 void*       _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid);
 void*       _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid);
@@ -179,10 +140,8 @@ void*       _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t
 void*       _mi_os_get_aligned_hint(size_t try_alignment, size_t size);
 bool        _mi_os_use_large_page(size_t size, size_t alignment);
 size_t      _mi_os_large_page_size(void);
-void*       _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid);
 
-int         _mi_os_numa_node_count(void);
-int         _mi_os_numa_node(void);
+void*       _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid);
 
 // arena.c
 mi_arena_id_t _mi_arena_id_none(void);
@@ -258,7 +217,6 @@ void        _mi_deferred_free(mi_heap_t* heap, bool force);
 void        _mi_page_free_collect(mi_page_t* page,bool force);
 void        _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page);   // callback from segments
 
-size_t      _mi_page_bin(const mi_page_t* page); // for stats
 size_t      _mi_bin_size(size_t bin);            // for stats
 size_t      _mi_bin(size_t size);                // for stats
 
@@ -275,7 +233,6 @@ bool        _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* pa
 
 // "stats.c"
 void        _mi_stats_done(mi_stats_t* stats);
-void        _mi_stats_merge_thread(mi_tld_t* tld);
 mi_msecs_t  _mi_clock_now(void);
 mi_msecs_t  _mi_clock_end(mi_msecs_t start);
 mi_msecs_t  _mi_clock_start(void);
@@ -297,6 +254,26 @@ bool        _mi_page_is_valid(mi_page_t* page);
 #endif
 
 
+// ------------------------------------------------------
+// Branches
+// ------------------------------------------------------
+
+#if defined(__GNUC__) || defined(__clang__)
+#define mi_unlikely(x)     (__builtin_expect(!!(x),false))
+#define mi_likely(x)       (__builtin_expect(!!(x),true))
+#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
+#define mi_unlikely(x)     (x) [[unlikely]]
+#define mi_likely(x)       (x) [[likely]]
+#else
+#define mi_unlikely(x)     (x)
+#define mi_likely(x)       (x)
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x)  0
+#endif
+
+
 /* -----------------------------------------------------------
   Error codes passed to `_mi_fatal_error`
   All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
@@ -321,32 +298,6 @@ bool        _mi_page_is_valid(mi_page_t* page);
 #endif
 
 
-// ------------------------------------------------------
-// Assertions
-// ------------------------------------------------------
-
-#if (MI_DEBUG)
-// use our own assertion to print without memory allocation
-mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func) mi_attr_noexcept;
-#define mi_assert(expr)     ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
-#else
-#define mi_assert(x)
-#endif
-
-#if (MI_DEBUG>1)
-#define mi_assert_internal    mi_assert
-#else
-#define mi_assert_internal(x)
-#endif
-
-#if (MI_DEBUG>2)
-#define mi_assert_expensive   mi_assert
-#else
-#define mi_assert_expensive(x)
-#endif
-
-
-
 /* -----------------------------------------------------------
   Inlined definitions
 ----------------------------------------------------------- */
@@ -912,10 +863,8 @@ static inline mi_memid_t _mi_memid_none(void) {
   return _mi_memid_create(MI_MEM_NONE);
 }
 
-static inline mi_memid_t _mi_memid_create_os(void* base, size_t size, bool committed, bool is_zero, bool is_large) {
+static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) {
   mi_memid_t memid = _mi_memid_create(MI_MEM_OS);
-  memid.mem.os.base = base;
-  memid.mem.os.size = size;
   memid.initially_committed = committed;
   memid.initially_zero = is_zero;
   memid.is_pinned = is_large;
@@ -947,6 +896,24 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) {
   return x;
 }
 
+// -------------------------------------------------------------------
+// Optimize NUMA node access for the common case (= one node)
+// -------------------------------------------------------------------
+
+int    _mi_os_numa_node_get(void);        // fallback used when the node count is not 1 (declared only; not defined in this header)
+size_t _mi_os_numa_node_count_get(void);  // fallback used when the stored count is 0 (declared only; not defined in this header)
+
+extern mi_decl_hidden _Atomic(size_t) _mi_numa_node_count;  // read below with relaxed atomic loads
+static inline int _mi_os_numa_node(void) {
+  if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; }  // exactly one node: it is always node 0
+  else return _mi_os_numa_node_get();
+}
+static inline size_t _mi_os_numa_node_count(void) {
+  const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count);
+  if mi_likely(count > 0) { return count; }  // NOTE(review): presumably 0 means "not yet determined" -- confirm against the definition
+  else return _mi_os_numa_node_count_get();
+}
+
 
 
 // -----------------------------------------------------------------------
@@ -987,7 +954,7 @@ static inline size_t mi_clz(size_t x) {
   #else
     _BitScanReverse64(&idx, x);
   #endif
-  return ((MI_SIZE_BITS - 1) - (size_t)idx);
+  return ((MI_SIZE_BITS - 1) - idx);
 }
 static inline size_t mi_ctz(size_t x) {
   if (x==0) return MI_SIZE_BITS;
@@ -997,7 +964,7 @@ static inline size_t mi_ctz(size_t x) {
   #else
     _BitScanForward64(&idx, x);
   #endif
-  return (size_t)idx;
+  return idx;
 }
 
 #else
@@ -1090,8 +1057,8 @@ static inline size_t mi_popcount(size_t x) {
 
 #if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
 #include <intrin.h>
-extern mi_decl_hidden bool _mi_cpu_has_fsrm;
-extern mi_decl_hidden bool _mi_cpu_has_erms;
+extern bool _mi_cpu_has_fsrm;
+extern bool _mi_cpu_has_erms;
 static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
   if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) {
     __movsb((unsigned char*)dst, (const unsigned char*)src, n);
diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h
index 1087d9b8..bddd66e9 100644
--- a/include/mimalloc/prim.h
+++ b/include/mimalloc/prim.h
@@ -59,15 +59,10 @@ int _mi_prim_commit(void* addr, size_t size, bool* is_zero);
 // pre: needs_recommit != NULL
 int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit);
 
-// Reset memory. The range keeps being accessible but the content might be reset to zero at any moment.
+// Reset memory. The range keeps being accessible but the content might be reset.
 // Returns error code or 0 on success.
 int _mi_prim_reset(void* addr, size_t size);
 
-// Reuse memory. This is called for memory that is already committed but
-// may have been reset (`_mi_prim_reset`) or decommitted (`_mi_prim_decommit`) where `needs_recommit` was false.
-// Returns error code or 0 on success. On most platforms this is a no-op.
-int _mi_prim_reuse(void* addr, size_t size);
-
 // Protect memory. Returns error code or 0 on success.
 int _mi_prim_protect(void* addr, size_t size, bool protect);
 
@@ -123,6 +118,9 @@ void _mi_prim_thread_done_auto_done(void);
 void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
 
 
+
+
+
 //-------------------------------------------------------------------
 // Access to TLS (thread local storage) slots.
 // We need fast access to both a unique thread id (in `free.c:mi_free`) and
@@ -210,19 +208,19 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
 #elif _WIN32 && MI_WIN_USE_FIXED_TLS && !defined(MI_WIN_USE_FLS)
 
 // On windows we can store the thread-local heap at a fixed TLS slot to avoid
-// thread-local initialization checks in the fast path.
-// We allocate a user TLS slot at process initialization (see `windows/prim.c`)
-// and store the offset `_mi_win_tls_offset`.
-#define MI_HAS_TLS_SLOT  1              // 2 = we can reliably initialize the slot (saving a test on each malloc)
+// thread-local initialization checks in the fast path. This uses a fixed location
+// in the TCB though (last user-reserved slot by default) which may clash with other applications.
 
-extern mi_decl_hidden size_t _mi_win_tls_offset;
+#define MI_HAS_TLS_SLOT      2              // 2 = we can reliably initialize the slot (saving a test on each malloc)
 
 #if MI_WIN_USE_FIXED_TLS > 1
 #define MI_TLS_SLOT     (MI_WIN_USE_FIXED_TLS)
 #elif MI_SIZE_SIZE == 4
-#define MI_TLS_SLOT     (0x0E10 + _mi_win_tls_offset)  // User TLS slots <https://en.wikipedia.org/wiki/Win32_Thread_Information_Block>
+#define MI_TLS_SLOT     (0x710)             // Last user-reserved slot <https://en.wikipedia.org/wiki/Win32_Thread_Information_Block>
+// #define MI_TLS_SLOT  (0xF0C)             // Last TlsSlot (might clash with other app reserved slot)
 #else
-#define MI_TLS_SLOT     (0x1480 + _mi_win_tls_offset)  // User TLS slots <https://en.wikipedia.org/wiki/Win32_Thread_Information_Block>
+#define MI_TLS_SLOT     (0x888)             // Last user-reserved slot <https://en.wikipedia.org/wiki/Win32_Thread_Information_Block>
+// #define MI_TLS_SLOT  (0x1678)            // Last TlsSlot (might clash with other app reserved slot)
 #endif
 
 static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
@@ -271,8 +269,8 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
 
 
 // defined in `init.c`; do not use these directly
-extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
-extern mi_decl_hidden bool _mi_process_is_initialized;             // has mi_process_init been called?
+extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+extern bool _mi_process_is_initialized;             // has mi_process_init been called?
 
 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
 
@@ -400,7 +398,7 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) {
 
 #elif defined(MI_TLS_PTHREAD)
 
-extern mi_decl_hidden pthread_key_t _mi_heap_default_key;
+extern pthread_key_t _mi_heap_default_key;
 static inline mi_heap_t* mi_prim_get_default_heap(void) {
   mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
   return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index a15d9cba..5a3f5fe2 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -571,6 +571,7 @@ struct mi_heap_s {
   size_t                guarded_size_min;                    // minimal size for guarded objects
   size_t                guarded_size_max;                    // maximal size for guarded objects
   size_t                guarded_sample_rate;                 // sample rate (set to 0 to disable guarded pages)
+  size_t                guarded_sample_seed;                 // starting sample count
   size_t                guarded_sample_count;                // current sample count (counting down to 0)
   #endif
   mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
@@ -632,6 +633,7 @@ struct mi_tld_s {
 };
 
 
+
 // ------------------------------------------------------
 // Debug
 // ------------------------------------------------------
@@ -646,6 +648,26 @@ struct mi_tld_s {
 #define MI_DEBUG_PADDING    (0xDE)
 #endif
 
+#if (MI_DEBUG)
+// use our own assertion to print without memory allocation
+void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
+#define mi_assert(expr)     ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
+#else
+#define mi_assert(x)
+#endif
+
+#if (MI_DEBUG>1)
+#define mi_assert_internal    mi_assert
+#else
+#define mi_assert_internal(x)
+#endif
+
+#if (MI_DEBUG>2)
+#define mi_assert_expensive   mi_assert
+#else
+#define mi_assert_expensive(x)
+#endif
+
 
 // ------------------------------------------------------
 // Statistics
diff --git a/readme.md b/readme.md
index 71aaf7a2..76e2711e 100644
--- a/readme.md
+++ b/readme.md
@@ -12,9 +12,9 @@ is a general purpose allocator with excellent [performance](#performance) charac
 Initially developed by Daan Leijen for the runtime systems of the
 [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
 
-Latest release   : `v3.1.4` (beta) (2025-06-09).  
-Latest v2 release: `v2.2.4` (2025-06-09).  
-Latest v1 release: `v1.9.4` (2024-06-09).
+Latest release   : `v3.0.3` (beta) (2025-03-28).  
+Latest v2 release: `v2.2.3` (2025-03-28).  
+Latest v1 release: `v1.9.3` (2025-03-28).
 
 mimalloc is a drop-in replacement for `malloc` and can be used in other programs
 without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
@@ -73,22 +73,18 @@ Enjoy!
 ### Branches
 
 * `main`: latest stable release (still based on `dev2`).
-* `dev`:  development branch for mimalloc v1. Use this branch for submitting PR's.
+* `dev`:  development branch for mimalloc v1. **Use this branch for submitting PR's**.
 * `dev2`: development branch for mimalloc v2. This branch is downstream of `dev` 
           (and is essentially equal to `dev` except for `src/segment.c`). Uses larger sliced segments to manage
           mimalloc pages that can reduce fragmentation.
-* `dev3`: development branch for mimalloc v3 beta. This branch is downstream of `dev`. This version 
-          simplifies the lock-free ownership of previous versions, and improves sharing of memory between 
-          threads. On certain large workloads this version may use (much) less memory.
+* `dev3`: development branch for mimalloc v3-beta. This branch is also downstream of `dev`. This version 
+          simplifies the lock-free ownership of previous versions and no longer has thread-local segments. 
+          This improves sharing of memory between threads, and on certain large workloads may use (much) less memory.
 
 ### Releases
 
-* 2025-06-09, `v1.9.4`, `v2.2.4`, `v3.1.4` (beta) : Some important bug fixes, including a case where OS memory
-  was not always fully released. Improved v3 performance, build on XBox, fix build on Android, support interpose 
-  for older macOS versions, use MADV_FREE_REUSABLE on macOS, always check commit success, better support for Windows 
-  fixed TLS offset, etc.
-* 2025-03-28, `v1.9.3`, `v2.2.3`, `v3.0.3` (beta) : Various small bug and build fixes, including:
-  fix arm32 pre v7 builds, fix mingw build, get runtime statistics, improve statistic commit counts, 
+* 2025-03-28, `v1.9.3`, `v2.2.3`, `v3.0.3` (beta): Various small bug and build fixes, including:
+  fix arm32 pre-v7 builds, fix mingw build, get runtime statistics, improve statistic commit counts, 
   fix execution on non BMI1 x64 systems. 
 * 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes. 
   Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`. 
@@ -180,7 +176,7 @@ mimalloc is used in various large scale low-latency services and programs, for e
 
 Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build.
 The `mimalloc-lib` project builds a static library (in `out/msvc-x64`), while the
-`mimalloc-override-dll` project builds a DLL for overriding malloc
+`mimalloc-override-dll` project builds a DLL for overriding malloc
 in the entire program.
 
 ## Linux, macOS, BSD, etc.
diff --git a/src/alloc-override.c b/src/alloc-override.c
index 52ab69c5..b5109ded 100644
--- a/src/alloc-override.c
+++ b/src/alloc-override.c
@@ -71,20 +71,24 @@ typedef void* mi_nothrow_t;
   #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
   #define MI_INTERPOSE_MI(fun)            MI_INTERPOSE_FUN(fun,mi_##fun)
 
-  #define MI_INTERPOSE_DECLS(name)        __attribute__((used)) static struct mi_interpose_s name[]  __attribute__((section("__DATA, __interpose")))
-
-  MI_INTERPOSE_DECLS(_mi_interposes) =
+  __attribute__((used)) static struct mi_interpose_s _mi_interposes[]  __attribute__((section("__DATA, __interpose"))) =
   {
     MI_INTERPOSE_MI(malloc),
     MI_INTERPOSE_MI(calloc),
     MI_INTERPOSE_MI(realloc),
     MI_INTERPOSE_MI(strdup),
+    #if defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7
+    MI_INTERPOSE_MI(strndup),
+    #endif
     MI_INTERPOSE_MI(realpath),
     MI_INTERPOSE_MI(posix_memalign),
     MI_INTERPOSE_MI(reallocf),
     MI_INTERPOSE_MI(valloc),
     MI_INTERPOSE_FUN(malloc_size,mi_malloc_size_checked),
     MI_INTERPOSE_MI(malloc_good_size),
+    #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
+    MI_INTERPOSE_MI(aligned_alloc),
+    #endif
     #ifdef MI_OSX_ZONE
     // we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely
     MI_INTERPOSE_MI(free),
@@ -95,12 +99,6 @@ typedef void* mi_nothrow_t;
     MI_INTERPOSE_FUN(vfree,mi_cfree),
     #endif
   };
-  MI_INTERPOSE_DECLS(_mi_interposes_10_7) __OSX_AVAILABLE(10.7) = {
-    MI_INTERPOSE_MI(strndup),
-  };
-  MI_INTERPOSE_DECLS(_mi_interposes_10_15) __OSX_AVAILABLE(10.15) = {
-    MI_INTERPOSE_MI(aligned_alloc),
-  };
 
   #ifdef __cplusplus
   extern "C" {
diff --git a/src/arena.c b/src/arena.c
index e97ca885..01102c27 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -44,7 +44,7 @@ typedef struct mi_arena_s {
   mi_lock_t           abandoned_visit_lock; // lock is only used when abandoned segments are being visited
   _Atomic(size_t)     search_idx;           // optimization to start the search for free blocks
   _Atomic(mi_msecs_t) purge_expire;         // expiration time when blocks should be purged from `blocks_purge`.
-
+  
   mi_bitmap_field_t*  blocks_dirty;         // are the blocks potentially non-zero?
   mi_bitmap_field_t*  blocks_committed;     // are the blocks committed? (can be NULL for memory that cannot be decommitted)
   mi_bitmap_field_t*  blocks_purge;         // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)
@@ -192,9 +192,14 @@ void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) {
   if (p != NULL) return p;
 
   // or fall back to the OS
-  p = _mi_os_zalloc(size, memid);
+  p = _mi_os_alloc(size, memid);
   if (p == NULL) return NULL;
 
+  // zero the OS memory if needed
+  if (!memid->initially_zero) {
+    _mi_memzero_aligned(p, size);
+    memid->initially_zero = true;
+  }
   return p;
 }
 
@@ -265,12 +270,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
   else if (commit) {
     // commit requested, but the range may not be committed as a whole: ensure it is committed now
     memid->initially_committed = true;
-    const size_t commit_size = mi_arena_block_size(needed_bcount);      
     bool any_uncommitted;
     size_t already_committed = 0;
     _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted, &already_committed);
     if (any_uncommitted) {
       mi_assert_internal(already_committed < needed_bcount);
+      const size_t commit_size = mi_arena_block_size(needed_bcount);
       const size_t stat_commit_size = commit_size - mi_arena_block_size(already_committed);
       bool commit_zero = false;
       if (!_mi_os_commit_ex(p, commit_size, &commit_zero, stat_commit_size)) {
@@ -280,10 +285,6 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
         if (commit_zero) { memid->initially_zero = true; }
       }
     }
-    else {
-      // all are already committed: signal that we are reusing memory in case it was purged before
-      _mi_os_reuse( p, commit_size );
-    }
   }
   else {
     // no need to commit, but check if already fully committed
@@ -368,7 +369,7 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz
 static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t *arena_id)
 {
   if (_mi_preloading()) return false;  // use OS only while pre loading
-
+  
   const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
   if (arena_count > (MI_MAX_ARENAS - 4)) return false;
 
@@ -410,7 +411,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset
 
   // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
   if (!mi_option_is_enabled(mi_option_disallow_arena_alloc)) {  // is arena allocation allowed?
-    if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0)
+    if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) 
     {
       void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid);
       if (p != NULL) return p;
@@ -490,7 +491,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks)
     // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory).
     mi_assert_internal(already_committed < blocks);
     mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
-    needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, mi_arena_block_size(already_committed));
+    needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, mi_arena_block_size(already_committed));    
   }
 
   // clear the purged blocks
@@ -559,7 +560,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force)
 {
   // check pre-conditions
   if (arena->memid.is_pinned) return false;
-
+   
   // expired yet?
   mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
   if (!force && (expire == 0 || expire > now)) return false;
@@ -614,7 +615,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force)
   return any_purged;
 }
 
-static void mi_arenas_try_purge( bool force, bool visit_all )
+static void mi_arenas_try_purge( bool force, bool visit_all ) 
 {
   if (_mi_preloading() || mi_arena_purge_delay() <= 0) return;  // nothing will be scheduled
 
@@ -631,7 +632,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all )
   mi_atomic_guard(&purge_guard)
   {
     // increase global expire: at most one purge per delay cycle
-    mi_atomic_storei64_release(&mi_arenas_purge_expire, now + mi_arena_purge_delay());
+    mi_atomic_storei64_release(&mi_arenas_purge_expire, now + mi_arena_purge_delay());  
     size_t max_purge_count = (visit_all ? max_arena : 2);
     bool all_visited = true;
     for (size_t i = 0; i < max_arena; i++) {
@@ -950,7 +951,7 @@ void mi_debug_show_arenas(void) mi_attr_noexcept {
   for (size_t i = 0; i < max_arenas; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
     if (arena == NULL) break;
-    _mi_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, (size_t)(MI_ARENA_BLOCK_SIZE / MI_MiB), arena->field_count, (arena->memid.is_pinned ? ", pinned" : ""));
+    _mi_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, (size_t)(MI_ARENA_BLOCK_SIZE / MI_MiB), arena->field_count, (arena->memid.is_pinned ? ", pinned" : ""));  // cast: `%zu` requires a `size_t` argument
     if (show_inuse) {
       inuse_total += mi_debug_show_bitmap("  ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count);
     }
@@ -1010,17 +1011,17 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t
   if (pages == 0) return 0;
 
   // pages per numa node
-  int numa_count = (numa_nodes > 0 && numa_nodes <= INT_MAX ? (int)numa_nodes : _mi_os_numa_node_count());
-  if (numa_count == 0) numa_count = 1;
+  size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
+  if (numa_count <= 0) numa_count = 1;
   const size_t pages_per = pages / numa_count;
   const size_t pages_mod = pages % numa_count;
   const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);
 
   // reserve evenly among numa nodes
-  for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
+  for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
     size_t node_pages = pages_per;  // can be 0
-    if ((size_t)numa_node < pages_mod) node_pages++;
-    int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per);
+    if (numa_node < pages_mod) node_pages++;
+    int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
     if (err) return err;
     if (pages < node_pages) {
       pages = 0;
diff --git a/src/free.c b/src/free.c
index 5e5ae443..3b906738 100644
--- a/src/free.c
+++ b/src/free.c
@@ -348,10 +348,7 @@ mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept {
 
 void mi_free_size(void* p, size_t size) mi_attr_noexcept {
   MI_UNUSED_RELEASE(size);
-  #if MI_DEBUG
-  const size_t available = _mi_usable_size(p,"mi_free_size");
-  mi_assert(p == NULL || size <= available || available == 0 /* invalid pointer */ );
-  #endif
+  mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size"));
   mi_free(p);
 }
 
diff --git a/src/heap.c b/src/heap.c
index f96e60d0..cbfee560 100644
--- a/src/heap.c
+++ b/src/heap.c
@@ -176,7 +176,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
   _mi_arenas_collect(collect == MI_FORCE /* force purge? */);
 
   // merge statistics
-  if (collect <= MI_FORCE) { _mi_stats_merge_thread(heap->tld); }
+  if (collect <= MI_FORCE) {
+    mi_stats_merge();
+  }
 }
 
 void _mi_heap_collect_abandon(mi_heap_t* heap) {
diff --git a/src/init.c b/src/init.c
index 3fc8b033..ddded152 100644
--- a/src/init.c
+++ b/src/init.c
@@ -123,7 +123,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   false,            // can reclaim
   0,                // tag
   #if MI_GUARDED
-  0, 0, 0, 1,       // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
+  0, 0, 0, 0, 1,    // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
   #endif
   MI_SMALL_PAGES_EMPTY,
   MI_PAGE_QUEUES_EMPTY
@@ -172,7 +172,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = {
   false,            // can reclaim
   0,                // tag
   #if MI_GUARDED
-  0, 0, 0, 0,
+  0, 0, 0, 0, 0,
   #endif
   MI_SMALL_PAGES_EMPTY,
   MI_PAGE_QUEUES_EMPTY
@@ -184,14 +184,15 @@ mi_stats_t _mi_stats_main = { MI_STAT_VERSION, MI_STATS_NULL };
 
 #if MI_GUARDED
 mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) {
-  heap->guarded_sample_rate  = sample_rate;
-  heap->guarded_sample_count = sample_rate;  // count down samples
-  if (heap->guarded_sample_rate > 1) {
-    if (seed == 0) {
-      seed = _mi_heap_random_next(heap);
-    }
-    heap->guarded_sample_count = (seed % heap->guarded_sample_rate) + 1;  // start at random count between 1 and `sample_rate`
+  heap->guarded_sample_seed = seed;  // keep the caller-supplied seed in the heap
+  if (heap->guarded_sample_seed == 0) {
+    heap->guarded_sample_seed = _mi_heap_random_next(heap);  // a zero seed is replaced by a value from `_mi_heap_random_next`
   }
+  heap->guarded_sample_rate  = sample_rate;
+  if (heap->guarded_sample_rate >= 1) {  // skip the modulo when the rate is 0 (would divide by zero)
+    heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate;  // reduce the seed to the range [0, sample_rate)
+  }
+  heap->guarded_sample_count = heap->guarded_sample_seed;  // count down samples, starting at the seed. NOTE(review): this start can be 0, while the removed code started between 1 and `sample_rate` -- confirm the consumer handles a 0 start
 }
 
 mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) {
@@ -244,6 +245,7 @@ mi_heap_t* _mi_heap_main_get(void) {
   return &_mi_heap_main;
 }
 
+
 /* -----------------------------------------------------------
   Sub process
 ----------------------------------------------------------- */
@@ -317,6 +319,7 @@ static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
 
 static mi_thread_data_t* mi_thread_data_zalloc(void) {
   // try to find thread metadata in the cache
+  bool is_zero = false;
   mi_thread_data_t* td = NULL;
   for (int i = 0; i < TD_CACHE_SIZE; i++) {
     td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
@@ -324,25 +327,32 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) {
       // found cached allocation, try use it
       td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
       if (td != NULL) {
-        _mi_memzero(td, offsetof(mi_thread_data_t,memid));
-        return td;
+        break;
       }
     }
   }
 
   // if that fails, allocate as meta data
-  mi_memid_t memid;
-  td = (mi_thread_data_t*)_mi_os_zalloc(sizeof(mi_thread_data_t), &memid);
   if (td == NULL) {
-    // if this fails, try once more. (issue #257)
-    td = (mi_thread_data_t*)_mi_os_zalloc(sizeof(mi_thread_data_t), &memid);
+    mi_memid_t memid;
+    td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
     if (td == NULL) {
-      // really out of memory
-      _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
-      return NULL;
+      // if this fails, try once more. (issue #257)
+      td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid);
+      if (td == NULL) {
+        // really out of memory
+        _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
+      }
+    }
+    if (td != NULL) {
+      td->memid = memid;
+      is_zero = memid.initially_zero;
     }
   }
-  td->memid = memid;
+
+  if (td != NULL && !is_zero) {
+    _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid));
+  }
   return td;
 }
 
@@ -577,7 +587,7 @@ mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept {
 }
 
 // Called once by the process loader from `src/prim/prim.c`
-void _mi_auto_process_init(void) {
+void _mi_process_load(void) {
   mi_heap_main_init();
   #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
   volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
@@ -664,8 +674,8 @@ void mi_process_init(void) mi_attr_noexcept {
   }
 }
 
-// Called when the process is done (cdecl as it is used with `at_exit` on some platforms)
-void mi_cdecl mi_process_done(void) mi_attr_noexcept {
+// Called when the process is done (through `at_exit`)
+void mi_cdecl _mi_process_done(void) {
   // only shutdown if we were initialized
   if (!_mi_process_is_initialized) return;
   // ensure we are called once
@@ -708,7 +718,3 @@ void mi_cdecl mi_process_done(void) mi_attr_noexcept {
   os_preloading = true; // don't call the C runtime anymore
 }
 
-void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept {
-  if (_mi_option_get_fast(mi_option_destroy_on_exit)>1) return;
-  mi_process_done();
-}
diff --git a/src/options.c b/src/options.c
index af2a0e70..4759e0b0 100644
--- a/src/options.c
+++ b/src/options.c
@@ -425,14 +425,14 @@ static mi_decl_noinline void mi_recurse_exit_prim(void) {
 }
 
 static bool mi_recurse_enter(void) {
-  #if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD)
+  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
   if (_mi_preloading()) return false;
   #endif
   return mi_recurse_enter_prim();
 }
 
 static void mi_recurse_exit(void) {
-  #if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD)
+  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
   if (_mi_preloading()) return;
   #endif
   mi_recurse_exit_prim();
@@ -525,7 +525,7 @@ void _mi_warning_message(const char* fmt, ...) {
 
 
 #if MI_DEBUG
-mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) mi_attr_noexcept {
+void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) {
   _mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n  assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion);
   abort();
 }
diff --git a/src/os.c b/src/os.c
index 9b1b4b46..2472b803 100644
--- a/src/os.c
+++ b/src/os.c
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -152,8 +152,8 @@ static void mi_os_free_huge_os_pages(void* p, size_t size);
 
 static void mi_os_prim_free(void* addr, size_t size, size_t commit_size) {
   mi_assert_internal((size % _mi_os_page_size()) == 0);
-  if (addr == NULL) return; // || _mi_os_is_huge_reserved(addr)
-  int err = _mi_prim_free(addr, size);  // allow size==0 (issue #1041)
+  if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr)
+  int err = _mi_prim_free(addr, size);
   if (err != 0) {
     _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr);
   }
@@ -166,16 +166,15 @@ static void mi_os_prim_free(void* addr, size_t size, size_t commit_size) {
 void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) {
   if (mi_memkind_is_os(memid.memkind)) {
     size_t csize = memid.mem.os.size;
-    if (csize==0) { csize = _mi_os_good_alloc_size(size); }
-    mi_assert_internal(csize >= size);
+    if (csize==0) { csize = _mi_os_good_alloc_size(size); }  // no size recorded in the memid: derive it from `size` (the result must be assigned, not discarded)
     size_t commit_size = (still_committed ? csize : 0);
     void* base = addr;
     // different base? (due to alignment)
     if (memid.mem.os.base != base) {
-      mi_assert(memid.mem.os.base <= addr);
+      mi_assert(memid.mem.os.base <= addr);      
       base = memid.mem.os.base;
       const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base;
-      if (memid.mem.os.size==0) {
+      if (memid.mem.os.size==0) { 
         csize += diff;
       }
       if (still_committed) {
@@ -286,10 +285,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
 
       // explicitly commit only the aligned part
       if (commit) {
-        if (!_mi_os_commit(p, size, NULL)) {
-          mi_os_prim_free(*base, over_size, 0);
-          return NULL;
-        }
+        _mi_os_commit(p, size, NULL);
       }
     }
     else  { // mmap can free inside an allocation
@@ -327,11 +323,9 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid) {
   bool os_is_large = false;
   bool os_is_zero  = false;
   void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero);
-  if (p == NULL) return NULL;
-
-  *memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large);  
-  mi_assert_internal(memid->mem.os.size >= size);
-  mi_assert_internal(memid->initially_committed);
+  if (p != NULL) {
+    *memid = _mi_memid_create_os(true, os_is_zero, os_is_large);
+  }
   return p;
 }
 
@@ -347,42 +341,15 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
   bool os_is_zero  = false;
   void* os_base = NULL;
   void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base );
-  if (p == NULL) return NULL;
-
-  *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large);
-  memid->mem.os.base = os_base;
-  memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base);  // todo: return from prim_alloc_aligned?
-
-  mi_assert_internal(memid->mem.os.size >= size);
-  mi_assert_internal(_mi_is_aligned(p,alignment));
-  if (commit) { mi_assert_internal(memid->initially_committed); }  
-  return p;
-}
-
-
-mi_decl_nodiscard static void* mi_os_ensure_zero(void* p, size_t size, mi_memid_t* memid) {
-  if (p==NULL || size==0) return p;
-  // ensure committed
-  if (!memid->initially_committed) {
-    bool is_zero = false;
-    if (!_mi_os_commit(p, size, &is_zero)) {
-      _mi_os_free(p, size, *memid);
-      return NULL;
-    }
-    memid->initially_committed = true;
+  if (p != NULL) {
+    *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large);
+    memid->mem.os.base = os_base;
+    // memid->mem.os.alignment = alignment;
+    memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base);  // todo: return from prim_alloc_aligned
   }
-  // ensure zero'd
-  if (memid->initially_zero) return p;
-  _mi_memzero_aligned(p,size);
-  memid->initially_zero = true;
   return p;
 }
 
-void*  _mi_os_zalloc(size_t size, mi_memid_t* memid) {
-  void* p = _mi_os_alloc(size,memid);
-  return mi_os_ensure_zero(p, size, memid);
-}
-
 /* -----------------------------------------------------------
   OS aligned allocation with an offset. This is used
   for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc
@@ -528,17 +495,6 @@ bool _mi_os_reset(void* addr, size_t size) {
 }
 
 
-void _mi_os_reuse( void* addr, size_t size ) {
-  // page align conservatively within the range
-  size_t csize = 0;
-  void* const start = mi_os_page_align_area_conservative(addr, size, &csize);
-  if (csize == 0) return;
-  const int err = _mi_prim_reuse(start, csize);
-  if (err != 0) {
-    _mi_warning_message("cannot reuse OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
-  }
-}
-
 // either resets or decommits memory, returns true if the memory needs
 // to be recommitted if it is to be re-used later on.
 bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size)
@@ -646,7 +602,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
   if (psize != NULL) *psize = 0;
   if (pages_reserved != NULL) *pages_reserved = 0;
   size_t size = 0;
-  uint8_t* const start = mi_os_claim_huge_pages(pages, &size);
+  uint8_t* start = mi_os_claim_huge_pages(pages, &size);
   if (start == NULL) return NULL; // or 32-bit systems
 
   // Allocate one page at the time but try to place them contiguously
@@ -702,7 +658,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
   if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
   if (page != 0) {
     mi_assert(start != NULL);
-    *memid = _mi_memid_create_os(start, size, true /* is committed */, all_zero, true /* is_large */);
+    *memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */);
     memid->memkind = MI_MEM_OS_HUGE;
     mi_assert(memid->is_pinned);
     #ifdef MI_TRACK_ASAN
@@ -724,47 +680,34 @@ static void mi_os_free_huge_os_pages(void* p, size_t size) {
   }
 }
 
-
 /* ----------------------------------------------------------------------------
 Support NUMA aware allocation
 -----------------------------------------------------------------------------*/
 
-static _Atomic(size_t) mi_numa_node_count; // = 0   // cache the node count
+_Atomic(size_t)  _mi_numa_node_count; // = 0   // cache the node count
 
-int _mi_os_numa_node_count(void) {
-  size_t count = mi_atomic_load_acquire(&mi_numa_node_count);
-  if mi_unlikely(count == 0) {
+size_t _mi_os_numa_node_count_get(void) {
+  size_t count = mi_atomic_load_acquire(&_mi_numa_node_count);
+  if (count <= 0) {
     long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
-    if (ncount > 0 && ncount < INT_MAX) {
+    if (ncount > 0) {
       count = (size_t)ncount;
     }
     else {
-      const size_t n = _mi_prim_numa_node_count(); // or detect dynamically
-      if (n == 0 || n > INT_MAX) { count = 1; }
-                            else { count = n; }
+      count = _mi_prim_numa_node_count(); // or detect dynamically
+      if (count == 0) count = 1;
     }
-    mi_atomic_store_release(&mi_numa_node_count, count); // save it
+    mi_atomic_store_release(&_mi_numa_node_count, count); // save it
     _mi_verbose_message("using %zd numa regions\n", count);
   }
-  mi_assert_internal(count > 0 && count <= INT_MAX);
-  return (int)count;
+  return count;
 }
 
-static int mi_os_numa_node_get(void) {
-  int numa_count = _mi_os_numa_node_count();
+int _mi_os_numa_node_get(void) {
+  size_t numa_count = _mi_os_numa_node_count();
   if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
   // never more than the node count and >= 0
-  const size_t n = _mi_prim_numa_node();
-  int numa_node = (n < INT_MAX ? (int)n : 0);
+  size_t numa_node = _mi_prim_numa_node();
   if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
-  return numa_node;
-}
-
-int _mi_os_numa_node(void) {
-  if mi_likely(mi_atomic_load_relaxed(&mi_numa_node_count) == 1) {
-    return 0;
-  }
-  else {
-    return mi_os_numa_node_get();
-  }
+  return (int)numa_node;
 }
diff --git a/src/page-queue.c b/src/page-queue.c
index c719b626..469e57d5 100644
--- a/src/page-queue.c
+++ b/src/page-queue.c
@@ -140,7 +140,7 @@ static inline bool mi_page_is_large_or_huge(const mi_page_t* page) {
   return (mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_huge(page));
 }
 
-size_t _mi_page_bin(const mi_page_t* page) {
+static size_t mi_page_bin(const mi_page_t* page) {
   const size_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page))));
   mi_assert_internal(bin <= MI_BIN_FULL);
   return bin;
@@ -148,7 +148,7 @@ size_t _mi_page_bin(const mi_page_t* page) {
 
 static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
   mi_assert_internal(heap!=NULL);
-  const size_t bin = _mi_page_bin(page);
+  const size_t bin = mi_page_bin(page);
   mi_page_queue_t* pq = &heap->pages[bin];
   mi_assert_internal((mi_page_block_size(page) == pq->block_size) ||
                        (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(pq)) ||
diff --git a/src/page.c b/src/page.c
index a5a10503..8db2463f 100644
--- a/src/page.c
+++ b/src/page.c
@@ -37,7 +37,7 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta
 }
 
 static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
-static bool mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
+static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
 
 #if (MI_DEBUG>=3)
 static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
@@ -112,7 +112,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
   return true;
 }
 
-extern mi_decl_hidden bool _mi_process_is_initialized;             // has mi_process_init been called?
+extern bool _mi_process_is_initialized;             // has mi_process_init been called?
 
 bool _mi_page_is_valid(mi_page_t* page) {
   mi_assert_internal(mi_page_is_valid_init(page));
@@ -291,7 +291,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size
   mi_assert_internal(full_block_size >= block_size);
   mi_page_init(heap, page, full_block_size, heap->tld);
   mi_heap_stat_increase(heap, pages, 1);
-  mi_heap_stat_increase(heap, page_bins[_mi_page_bin(page)], 1);
+  mi_heap_stat_increase(heap, page_bins[mi_page_bin(page)], 1);
   if (pq != NULL) { mi_page_queue_push(heap, pq, page); }
   mi_assert_expensive(_mi_page_is_valid(page));
   return page;
@@ -445,7 +445,8 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
   mi_segments_tld_t* segments_tld = &heap->tld->segments;
   mi_page_queue_remove(pq, page);
 
-  // and free it  
+  // and free it
+  mi_heap_stat_decrease(heap, page_bins[mi_page_bin(page)], 1);
   mi_page_set_heap(page,NULL);
   _mi_segment_page_free(page, force, segments_tld);
 }
@@ -632,14 +633,15 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
 // Note: we also experimented with "bump" allocation on the first
 // allocations but this did not speed up any benchmark (due to an
 // extra test in malloc? or cache effects?)
-static bool mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
+static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
+  MI_UNUSED(tld);
   mi_assert_expensive(mi_page_is_valid_init(page));
   #if (MI_SECURE<=2)
   mi_assert(page->free == NULL);
   mi_assert(page->local_free == NULL);
-  if (page->free != NULL) return true;
+  if (page->free != NULL) return;
   #endif
-  if (page->capacity >= page->reserved) return true;
+  if (page->capacity >= page->reserved) return;
 
   mi_stat_counter_increase(tld->stats.pages_extended, 1);
 
@@ -672,7 +674,6 @@ static bool mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
   page->capacity += (uint16_t)extend;
   mi_stat_increase(tld->stats.page_committed, extend * bsize);
   mi_assert_expensive(mi_page_is_valid_init(page));
-  return true;
 }
 
 // Initialize a fresh page
@@ -727,10 +728,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
   mi_assert_expensive(mi_page_is_valid_init(page));
 
   // initialize an initial free list
-  if (mi_page_extend_free(heap,page,tld)) {
-    mi_assert(mi_page_immediate_available(page));
-  }
-  return;
+  mi_page_extend_free(heap,page,tld);
+  mi_assert(mi_page_immediate_available(page));
 }
 
 
@@ -822,14 +821,9 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
   if (page_candidate != NULL) {
     page = page_candidate;
   }
-  if (page != NULL) {
-    if (!mi_page_immediate_available(page)) {
-      mi_assert_internal(mi_page_is_expandable(page));
-      if (!mi_page_extend_free(heap, page, heap->tld)) {
-        page = NULL; // failed to extend
-      }
-    }
-    mi_assert_internal(page == NULL || mi_page_immediate_available(page));
+  if (page != NULL && !mi_page_immediate_available(page)) {
+    mi_assert_internal(mi_page_is_expandable(page));
+    mi_page_extend_free(heap, page, heap->tld);
   }
 
   if (page == NULL) {
@@ -1004,9 +998,9 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
 
     // free delayed frees from other threads (but skip contended ones)
     _mi_heap_delayed_free_partial(heap);
-
+    
     // collect every once in a while (10000 by default)
-    const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L);
+    const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L);    
     if (heap->generic_collect_count >= generic_collect) {
       heap->generic_collect_count = 0;
       mi_heap_collect(heap, false /* force? */);
diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c
index c4cfc35d..82147de7 100644
--- a/src/prim/emscripten/prim.c
+++ b/src/prim/emscripten/prim.c
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2025, Microsoft Research, Daan Leijen, Alon Zakai
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen, Alon Zakai
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -58,7 +58,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config) {
 extern void emmalloc_free(void*);
 
 int _mi_prim_free(void* addr, size_t size) {
-  if (size==0) return 0;
+  MI_UNUSED(size);
   emmalloc_free(addr);
   return 0;
 }
@@ -114,11 +114,6 @@ int _mi_prim_reset(void* addr, size_t size) {
   return 0;
 }
 
-int _mi_prim_reuse(void* addr, size_t size) {
-  MI_UNUSED(addr); MI_UNUSED(size);
-  return 0;
-}
-
 int _mi_prim_protect(void* addr, size_t size, bool protect) {
   MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect);
   return 0;
diff --git a/src/prim/prim.c b/src/prim/prim.c
index 5147bae8..2002853f 100644
--- a/src/prim/prim.c
+++ b/src/prim/prim.c
@@ -39,29 +39,29 @@ terms of the MIT license. A copy of the license can be found in the file
     #define mi_attr_destructor  __attribute__((destructor))
   #endif
   static void mi_attr_constructor mi_process_attach(void) {
-    _mi_auto_process_init();
+    _mi_process_load();
   }
   static void mi_attr_destructor mi_process_detach(void) {
-    _mi_auto_process_done();
+    _mi_process_done();
   }
 #elif defined(__cplusplus)
   // C++: use static initialization to detect process start/end
   // This is not guaranteed to be first/last but the best we can generally do?
   struct mi_init_done_t {
     mi_init_done_t() {
-      _mi_auto_process_init();
+      _mi_process_load();
     }
     ~mi_init_done_t() {
-      _mi_auto_process_done();
+      _mi_process_done();
     }
   };
   static mi_init_done_t mi_init_done;
  #else
-  #pragma message("define a way to call _mi_auto_process_init/done on your platform")
+  #pragma message("define a way to call _mi_process_load/done on your platform")
 #endif
 #endif
 
 // Generic allocator init/done callback
 #ifndef MI_PRIM_HAS_ALLOCATOR_INIT
 bool _mi_is_redirected(void) {
   return false;
diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 650aa657..ad6ca2a9 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2025, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -31,10 +31,10 @@ terms of the MIT license. A copy of the license can be found in the file
 
 #if defined(__linux__)
   #include <features.h>
-  #include <sys/prctl.h>    // THP disable, PR_SET_VMA
-  #if defined(__GLIBC__) && !defined(PR_SET_VMA)
-  #include <linux/prctl.h>
-  #endif
+  #include <linux/prctl.h>  // PR_SET_VMA
+  //#if defined(MI_NO_THP)
+  #include <sys/prctl.h>    // THP disable
+  //#endif
   #if defined(__GLIBC__)
   #include <linux/mman.h>   // linux mmap flags
   #else
@@ -70,8 +70,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MADV_FREE  POSIX_MADV_FREE
 #endif
 
-#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this?
-
+  
 //------------------------------------------------------------------------------------
 // Use syscalls for some primitives to allow for libraries that override open/read/close etc.
 // and do allocation themselves; using syscalls prevents recursion when mimalloc is
@@ -157,7 +156,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
     }
     #endif
   }
-  config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE;
+  config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
   config->has_overcommit = unix_detect_overcommit();
   config->has_partial_free = true;    // mmap can free in parts
   config->has_virtual_reserve = true; // todo: check if this true for NetBSD?  (for anonymous mmap with PROT_NONE)
@@ -187,7 +186,6 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
 //---------------------------------------------
 
 int _mi_prim_free(void* addr, size_t size ) {
-  if (size==0) return 0;
   bool err = (munmap(addr, size) == -1);
   return (err ? errno : 0);
 }
@@ -210,7 +208,7 @@ static int unix_madvise(void* addr, size_t size, int advice) {
 
 static void* unix_mmap_prim(void* addr, size_t size, int protect_flags, int flags, int fd) {
   void* p = mmap(addr, size, protect_flags, flags, fd, 0 /* offset */);
-  #if defined(__linux__) && defined(PR_SET_VMA)
+  #if (defined(__linux__) && defined(PR_SET_VMA))
   if (p!=MAP_FAILED && p!=NULL) {
     prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p, size, "mimalloc");
   }
@@ -387,9 +385,6 @@ int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool comm
   mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
   mi_assert_internal(commit || !allow_large);
   mi_assert_internal(try_alignment > 0);
-  if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) {
-    try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations
-  }
 
   *is_zero = true;
   int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
@@ -429,25 +424,11 @@ int _mi_prim_commit(void* start, size_t size, bool* is_zero) {
   return err;
 }
 
-int _mi_prim_reuse(void* start, size_t size) {
-  MI_UNUSED(start); MI_UNUSED(size);
-  #if defined(__APPLE__) && defined(MADV_FREE_REUSE)
-  return unix_madvise(start, size, MADV_FREE_REUSE);
-  #endif
-  return 0;
-}
-
 int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
   int err = 0;
-  #if defined(__APPLE__) && defined(MADV_FREE_REUSABLE)
-    // decommit on macOS: use MADV_FREE_REUSABLE as it does immediate rss accounting (issue #1097)
-    err = unix_madvise(start, size, MADV_FREE_REUSABLE);
-    if (err) { err = unix_madvise(start, size, MADV_DONTNEED); }
-  #else
-    // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
-    err = unix_madvise(start, size, MADV_DONTNEED);
-  #endif  
-  #if !MI_DEBUG && MI_SECURE<=2
+  // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
+  err = unix_madvise(start, size, MADV_DONTNEED);
+  #if !MI_DEBUG && !MI_SECURE
     *needs_recommit = false;
   #else
     *needs_recommit = true;
@@ -464,22 +445,14 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
 }
 
 int _mi_prim_reset(void* start, size_t size) {
-  int err = 0;
-
-  // on macOS can use MADV_FREE_REUSABLE (but we disable this for now as it seems slower)
-  #if 0 && defined(__APPLE__) && defined(MADV_FREE_REUSABLE) 
-  err = unix_madvise(start, size, MADV_FREE_REUSABLE);  
-  if (err==0) return 0;
-  // fall through
-  #endif
-
-  #if defined(MADV_FREE)
-  // Otherwise, we try to use `MADV_FREE` as that is the fastest. A drawback though is that it
+  // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it
   // will not reduce the `rss` stats in tools like `top` even though the memory is available
   // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by
   // default `MADV_DONTNEED` is used though.
+  #if defined(MADV_FREE)
   static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
   int oadvice = (int)mi_atomic_load_relaxed(&advice);
+  int err;
   while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0;  };
   if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
     // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
@@ -487,7 +460,7 @@ int _mi_prim_reset(void* start, size_t size) {
     err = unix_madvise(start, size, MADV_DONTNEED);
   }
   #else
-  err = unix_madvise(start, size, MADV_DONTNEED);
+  int err = unix_madvise(start, size, MADV_DONTNEED);
   #endif
   return err;
 }
diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c
index 745a41fd..e1e7de5e 100644
--- a/src/prim/wasi/prim.c
+++ b/src/prim/wasi/prim.c
@@ -149,11 +149,6 @@ int _mi_prim_reset(void* addr, size_t size) {
   return 0;
 }
 
-int _mi_prim_reuse(void* addr, size_t size) {
-  MI_UNUSED(addr); MI_UNUSED(size);
-  return 0;
-}
-
 int _mi_prim_protect(void* addr, size_t size, bool protect) {
   MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect);
   return 0;
diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c
index eebdc4a6..a080f4bc 100644
--- a/src/prim/windows/prim.c
+++ b/src/prim/windows/prim.c
@@ -12,10 +12,6 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc/prim.h"
 #include <stdio.h>   // fputs, stderr
 
-// xbox has no console IO
-#if !defined(WINAPI_FAMILY_PARTITION) || WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM)
-#define MI_HAS_CONSOLE_IO
-#endif
 
 //---------------------------------------------
 // Dynamically bind Windows API points for portability
@@ -49,30 +45,22 @@ typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {
 #define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE   0x00000010
 
 #include <winternl.h>
-typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
-typedef LONG  (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);  // avoid NTSTATUS as it is not defined on xbox (pr #1084)
+typedef PVOID    (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
 static PVirtualAlloc2 pVirtualAlloc2 = NULL;
 static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
 
-// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7  (and GetNumaNodeProcessorMask is not supported on xbox)
+// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7
 typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
 
 typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
 typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber);
 typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
 typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber);
-typedef BOOL (__stdcall* PGetNumaNodeProcessorMask)(UCHAR Node, PULONGLONG ProcessorMask);
-typedef BOOL (__stdcall* PGetNumaHighestNodeNumber)(PULONG Node);
 static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
 static PGetNumaProcessorNodeEx      pGetNumaProcessorNodeEx = NULL;
 static PGetNumaNodeProcessorMaskEx  pGetNumaNodeProcessorMaskEx = NULL;
 static PGetNumaProcessorNode        pGetNumaProcessorNode = NULL;
-static PGetNumaNodeProcessorMask    pGetNumaNodeProcessorMask = NULL;
-static PGetNumaHighestNodeNumber    pGetNumaHighestNodeNumber = NULL;
-
-// Not available on xbox
-typedef SIZE_T(__stdcall* PGetLargePageMinimum)(VOID);
-static PGetLargePageMinimum pGetLargePageMinimum = NULL;
 
 // Available after Windows XP
 typedef BOOL (__stdcall *PGetPhysicallyInstalledSystemMemory)( PULONGLONG TotalMemoryInKilobytes );
@@ -86,7 +74,6 @@ static bool win_enable_large_os_pages(size_t* large_page_size)
   static bool large_initialized = false;
   if (large_initialized) return (_mi_os_large_page_size() > 0);
   large_initialized = true;
-  if (pGetLargePageMinimum==NULL) return false;  // no large page support (xbox etc.)
 
   // Try to see if large OS pages are supported
   // To use large pages on Windows, we first need access permission
@@ -105,8 +92,8 @@ static bool win_enable_large_os_pages(size_t* large_page_size)
       if (ok) {
         err = GetLastError();
         ok = (err == ERROR_SUCCESS);
-        if (ok && large_page_size != NULL && pGetLargePageMinimum != NULL) {
-          *large_page_size = (*pGetLargePageMinimum)();
+        if (ok && large_page_size != NULL) {
+          *large_page_size = GetLargePageMinimum();
         }
       }
     }
@@ -162,9 +149,6 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
     pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
     pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
     pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode");
-    pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMask");
-    pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)(void (*)(void))GetProcAddress(hDll, "GetNumaHighestNodeNumber");
-    pGetLargePageMinimum = (PGetLargePageMinimum)(void (*)(void))GetProcAddress(hDll, "GetLargePageMinimum");
     // Get physical memory (not available on XP, so check dynamically)
     PGetPhysicallyInstalledSystemMemory pGetPhysicallyInstalledSystemMemory = (PGetPhysicallyInstalledSystemMemory)(void (*)(void))GetProcAddress(hDll,"GetPhysicallyInstalledSystemMemory");
     if (pGetPhysicallyInstalledSystemMemory != NULL) {
@@ -368,11 +352,6 @@ int _mi_prim_reset(void* addr, size_t size) {
   return (p != NULL ? 0 : (int)GetLastError());
 }
 
-int _mi_prim_reuse(void* addr, size_t size) {
-  MI_UNUSED(addr); MI_UNUSED(size);
-  return 0;
-}
-
 int _mi_prim_protect(void* addr, size_t size, bool protect) {
   DWORD oldprotect = 0;
   BOOL ok = VirtualProtect(addr, size, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect);
@@ -404,7 +383,7 @@ static void* _mi_prim_alloc_huge_os_pagesx(void* hint_addr, size_t size, int num
     }
     SIZE_T psize = size;
     void* base = hint_addr;
-    LONG err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
+    NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
     if (err == 0 && base != NULL) {
       return base;
     }
@@ -458,11 +437,9 @@ size_t _mi_prim_numa_node(void) {
 
 size_t _mi_prim_numa_node_count(void) {
   ULONG numa_max = 0;
-  if (pGetNumaHighestNodeNumber!=NULL) {
-    (*pGetNumaHighestNodeNumber)(&numa_max);
-  }
+  GetNumaHighestNodeNumber(&numa_max);
   // find the highest node number that has actual processors assigned to it. Issue #282
-  while (numa_max > 0) {
+  while(numa_max > 0) {
     if (pGetNumaNodeProcessorMaskEx != NULL) {
       // Extended API is supported
       GROUP_AFFINITY affinity;
@@ -473,10 +450,8 @@ size_t _mi_prim_numa_node_count(void) {
     else {
       // Vista or earlier, use older API that is limited to 64 processors.
       ULONGLONG mask;
-      if (pGetNumaNodeProcessorMask != NULL) {
-        if ((*pGetNumaNodeProcessorMask)((UCHAR)numa_max, &mask)) {
-          if (mask != 0) break; // found the maximum non-empty node
-        }
+      if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
+        if (mask != 0) break; // found the maximum non-empty node
       };
     }
     // max node was invalid or had no processor assigned, try again
@@ -566,21 +541,17 @@ void _mi_prim_out_stderr( const char* msg )
   if (!_mi_preloading()) {
     // _cputs(msg);  // _cputs cannot be used as it aborts when failing to lock the console
     static HANDLE hcon = INVALID_HANDLE_VALUE;
-    static bool hconIsConsole = false;
+    static bool hconIsConsole;
     if (hcon == INVALID_HANDLE_VALUE) {
-      hcon = GetStdHandle(STD_ERROR_HANDLE);
-      #ifdef MI_HAS_CONSOLE_IO
       CONSOLE_SCREEN_BUFFER_INFO sbi;
+      hcon = GetStdHandle(STD_ERROR_HANDLE);
       hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi));
-      #endif  
     }
     const size_t len = _mi_strlen(msg);
     if (len > 0 && len < UINT32_MAX) {
       DWORD written = 0;
       if (hconIsConsole) {
-        #ifdef MI_HAS_CONSOLE_IO
         WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL);
-        #endif      
       }
       else if (hcon != INVALID_HANDLE_VALUE) {
         // use direct write if stderr was redirected
@@ -656,47 +627,19 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
 // Process & Thread Init/Done
 //----------------------------------------------------------------
 
-#if MI_WIN_USE_FIXED_TLS==1
-mi_decl_cache_align size_t _mi_win_tls_offset = 0;
-#endif
-
-//static void mi_debug_out(const char* s) {
-//  HANDLE h = GetStdHandle(STD_ERROR_HANDLE);
-//  WriteConsole(h, s, (DWORD)_mi_strlen(s), NULL, NULL);
-//}
-
-static void mi_win_tls_init(DWORD reason) {
-  if (reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) {
-    #if MI_WIN_USE_FIXED_TLS==1  // we must allocate a TLS slot dynamically
-    if (_mi_win_tls_offset == 0 && reason == DLL_PROCESS_ATTACH) {
-      const DWORD tls_slot = TlsAlloc();  // usually returns slot 1
-      if (tls_slot == TLS_OUT_OF_INDEXES) {
-        _mi_error_message(EFAULT, "unable to allocate the a TLS slot (rebuild without MI_WIN_USE_FIXED_TLS?)\n");
-      }
-      _mi_win_tls_offset = (size_t)tls_slot * sizeof(void*);
-    }
-    #endif
-    #if MI_HAS_TLS_SLOT >= 2  // we must initialize the TLS slot before any allocation
-    if (mi_prim_get_default_heap() == NULL) {
-      _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty);
-      #if MI_DEBUG && MI_WIN_USE_FIXED_TLS==1
-      void* const p = TlsGetValue((DWORD)(_mi_win_tls_offset / sizeof(void*)));
-      mi_assert_internal(p == (void*)&_mi_heap_empty);
-      #endif
-    }
-    #endif
-  }
-}
-
 static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
   MI_UNUSED(reserved);
   MI_UNUSED(module);
-  mi_win_tls_init(reason);
+  #if MI_TLS_SLOT >= 2
+  if ((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) {
+    _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty);
+  }
+  #endif
   if (reason==DLL_PROCESS_ATTACH) {
-    _mi_auto_process_init();
+    _mi_process_load();
   }
   else if (reason==DLL_PROCESS_DETACH) {
-    _mi_auto_process_done();
+    _mi_process_done();
   }
   else if (reason==DLL_THREAD_DETACH && !_mi_is_redirected()) {
     _mi_thread_done(NULL);
@@ -786,7 +729,7 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
 
     static int mi_process_attach(void) {
       mi_win_main(NULL,DLL_PROCESS_ATTACH,NULL);
-      atexit(&_mi_auto_process_done);
+      atexit(&_mi_process_done);
       return 0;
     }
     typedef int(*mi_crt_callback_t)(void);
@@ -853,7 +796,11 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
   #endif
   mi_decl_export void _mi_redirect_entry(DWORD reason) {
     // called on redirection; careful as this may be called before DllMain
-    mi_win_tls_init(reason);
+    #if MI_TLS_SLOT >= 2
+    if ((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) {
+      _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty);
+    }
+    #endif
     if (reason == DLL_PROCESS_ATTACH) {
       mi_redirected = true;
     }
diff --git a/src/segment-map.c b/src/segment-map.c
index bbcea28a..2f68f8c4 100644
--- a/src/segment-map.c
+++ b/src/segment-map.c
@@ -61,7 +61,7 @@ static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bo
   if mi_unlikely(part == NULL) {
     if (!create_on_demand) return NULL;
     mi_memid_t memid;
-    part = (mi_segmap_part_t*)_mi_os_zalloc(sizeof(mi_segmap_part_t), &memid);
+    part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid);
     if (part == NULL) return NULL;
     part->memid = memid;
     mi_segmap_part_t* expected = NULL;
diff --git a/src/segment.c b/src/segment.c
index 32841e6d..29502bcd 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -1023,7 +1023,6 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld
   size_t inuse = page->capacity * mi_page_block_size(page);
   _mi_stat_decrease(&tld->stats->page_committed, inuse);
   _mi_stat_decrease(&tld->stats->pages, 1);
-  _mi_stat_decrease(&tld->stats->page_bins[_mi_page_bin(page)], 1);
 
   // reset the page memory to reduce memory pressure?
   if (segment->allow_decommit && mi_option_is_enabled(mi_option_deprecated_page_reset)) {
diff --git a/src/stats.c b/src/stats.c
index 34b3d4e4..dec74f70 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -218,14 +218,12 @@ static void mi_stat_peak_print(const mi_stat_count_t* stat, const char* msg, int
   _mi_fprintf(out, arg, "\n");
 }
 
-#if MI_STAT>1
 static void mi_stat_total_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) {
   _mi_fprintf(out, arg, "%10s:", msg);
   _mi_fprintf(out, arg, "%12s", " ");  // no peak
   mi_print_amount(stat->total, unit, out, arg);
   _mi_fprintf(out, arg, "\n");
 }
-#endif
 
 static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) {
   _mi_fprintf(out, arg, "%10s:", msg);
@@ -350,7 +348,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
   mi_stat_counter_print(&stats->malloc_guarded_count, "guarded", out, arg);
   mi_stat_print(&stats->threads, "threads", -1, out, arg);
   mi_stat_counter_print_avg(&stats->page_searches, "searches", out, arg);
-  _mi_fprintf(out, arg, "%10s: %5i\n", "numa nodes", _mi_os_numa_node_count());
+  _mi_fprintf(out, arg, "%10s: %5zu\n", "numa nodes", _mi_os_numa_node_count());
 
   size_t elapsed;
   size_t user_time;
@@ -361,9 +359,9 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
   size_t peak_commit;
   size_t page_faults;
   mi_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
-  _mi_fprintf(out, arg, "%10s: %5zu.%03zu s\n", "elapsed", elapsed/1000, elapsed%1000);
-  _mi_fprintf(out, arg, "%10s: user: %zu.%03zu s, system: %zu.%03zu s, faults: %zu, rss: ", "process",
-              user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, page_faults );
+  _mi_fprintf(out, arg, "%10s: %5ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
+  _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process",
+              user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults );
   mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s");
   if (peak_commit > 0) {
     _mi_fprintf(out, arg, ", commit: ");
@@ -397,10 +395,6 @@ void mi_stats_merge(void) mi_attr_noexcept {
   mi_stats_merge_from( mi_stats_get_default() );
 }
 
-void _mi_stats_merge_thread(mi_tld_t* tld) {
-  mi_stats_merge_from( &tld->stats );
-}
-
 void _mi_stats_done(mi_stats_t* stats) {  // called from `mi_thread_done`
   mi_stats_merge_from(stats);
 }
@@ -504,7 +498,7 @@ static bool mi_heap_buf_expand(mi_heap_buf_t* hbuf) {
     hbuf->buf[hbuf->size-1] = 0;
   }
   if (hbuf->size > SIZE_MAX/2 || !hbuf->can_realloc) return false;
-  const size_t newsize = (hbuf->size == 0 ? mi_good_size(12*MI_KiB) : 2*hbuf->size);
+  const size_t newsize = (hbuf->size == 0 ? 2*MI_KiB : 2*hbuf->size);
   char* const  newbuf  = (char*)mi_rezalloc(hbuf->buf, newsize);
   if (newbuf == NULL) return false;
   hbuf->buf = newbuf;
diff --git a/test/main-override-dep.cpp b/test/main-override-dep.cpp
index d89e3fca..e92f6fc4 100644
--- a/test/main-override-dep.cpp
+++ b/test/main-override-dep.cpp
@@ -1,7 +1,6 @@
 // Issue #981: test overriding allocation in a DLL that is compiled independent of mimalloc. 
 // This is imported by the `mimalloc-test-override` project.
 #include <string>
-#include <iostream>
 #include "main-override-dep.h"
 
 std::string TestAllocInDll::GetString()
@@ -11,41 +10,6 @@ std::string TestAllocInDll::GetString()
 	const char* t = "test";
 	memcpy(test, t, 4);
 	std::string r = test;
-  std::cout << "override-dep: GetString: " << r << "\n";
 	delete[] test;
 	return r;
-}
-
-
-class Static {
-private:
-  void* p;
-public:
-  Static() {
-    printf("override-dep: static constructor\n");
-    p = malloc(64);
-    return;
-  }
-  ~Static() {
-    free(p);
-    printf("override-dep: static destructor\n");
-    return;
-  }
-};
-
-static Static s = Static();
-
-
-#include <windows.h>
-
-BOOL WINAPI DllMain(HINSTANCE module, DWORD reason, LPVOID reserved) {
-  (void)(reserved);
-  (void)(module);
-  if (reason==DLL_PROCESS_ATTACH) {
-    printf("override-dep: dll attach\n");
-  }
-  else if (reason==DLL_PROCESS_DETACH) {
-    printf("override-dep: dll detach\n");
-  }  
-  return TRUE;
-}
+}
diff --git a/test/main-override-static.c b/test/main-override-static.c
index 420b8bf7..4190c962 100644
--- a/test/main-override-static.c
+++ b/test/main-override-static.c
@@ -43,7 +43,7 @@ int main() {
   // corrupt_free();
   // block_overflow1();
   // block_overflow2();
-  test_canary_leak();
+  // test_canary_leak();
   // test_aslr();
   // invalid_free();
   // test_reserved();
diff --git a/test/main-override.cpp b/test/main-override.cpp
index 75b409fd..747af994 100644
--- a/test/main-override.cpp
+++ b/test/main-override.cpp
@@ -40,7 +40,7 @@ static void test_thread_local();      // issue #944
 static void test_mixed1();             // issue #942
 static void test_stl_allocators();
 
-#if _WIN32
+#if x_WIN32
 #include "main-override-dep.h"
 static void test_dep();               // issue #981: test overriding in another DLL
 #else
@@ -150,12 +150,11 @@ static bool test_stl_allocator1() {
 struct some_struct { int i; int j; double z; };
 
 
-#if _WIN32
+#if x_WIN32
 static void test_dep()
 {
   TestAllocInDll t;
   std::string s = t.GetString();
-  std::cout << "test_dep GetString: " << s << "\n";
 }
 #endif
 
diff --git a/test/test-stress.c b/test/test-stress.c
index 4f5a3d58..d65c7f2e 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -343,7 +343,6 @@ int main(int argc, char** argv) {
     mi_free(json);
   }
   #endif
-  mi_collect(true);
   mi_stats_print(NULL);  
 #endif
   //bench_end_program();