diff --git a/CMakeLists.txt b/CMakeLists.txt
index 203da67b..ed0a46de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,15 +6,14 @@ set(CMAKE_CXX_STANDARD 17)
option(MI_OVERRIDE "Override the standard malloc interface" ON)
option(MI_INTERPOSE "Use interpose to override standard malloc on macOS" ON)
-option(MI_SEE_ASM "Generate assembly files" OFF)
-option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode" OFF)
+option(MI_DEBUG_FULL "Use full internal heap invariant checking in DEBUG mode" OFF)
+option(MI_SECURE "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF)
option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF)
-option(MI_SECURE "Use security mitigations (like guard pages and randomization)" OFF)
+option(MI_SEE_ASM "Generate assembly files" OFF)
option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF)
option(MI_BUILD_TESTS "Build test executables" ON)
option(MI_USER_CLEANUP "Enable mi_register_user_cleanup functionality" OFF)
-
-set(mi_install_dir "lib/mimalloc-${mi_version}")
+option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
set(mi_sources
src/stats.c
@@ -29,29 +28,33 @@ set(mi_sources
src/options.c
src/init.c)
-# Set default build type
+# -----------------------------------------------------------------------------
+# Convenience: set default build type depending on the build directory
+# -----------------------------------------------------------------------------
+
if (NOT CMAKE_BUILD_TYPE)
- if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$")
- message(STATUS "No build type selected, default to *** Debug ***")
+ if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$" OR MI_DEBUG_FULL MATCHES "ON")
+ message(STATUS "No build type selected, default to: Debug")
set(CMAKE_BUILD_TYPE "Debug")
else()
- message(STATUS "No build type selected, default to *** Release ***")
+ message(STATUS "No build type selected, default to: Release")
set(CMAKE_BUILD_TYPE "Release")
endif()
-else()
- message(STATUS "Build type specified as *** ${CMAKE_BUILD_TYPE} ***")
endif()
if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$")
+ message(STATUS "Default to secure build")
set(MI_SECURE "ON")
endif()
+# -----------------------------------------------------------------------------
+# Process options
+# -----------------------------------------------------------------------------
+
if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
set(MI_USE_CXX "ON")
endif()
-
-# Options
if(MI_OVERRIDE MATCHES "ON")
message(STATUS "Override standard malloc (MI_OVERRIDE=ON)")
if(APPLE)
@@ -68,8 +71,8 @@ if(MI_OVERRIDE MATCHES "ON")
endif()
if(MI_SECURE MATCHES "ON")
- message(STATUS "Set secure build (MI_SECURE=ON)")
- list(APPEND mi_defines MI_SECURE=2)
+ message(STATUS "Set full secure build (MI_SECURE=ON)")
+ list(APPEND mi_defines MI_SECURE=4)
endif()
if(MI_USER_CLEANUP MATCHES "ON")
@@ -83,7 +86,12 @@ if(MI_SEE_ASM MATCHES "ON")
endif()
if(MI_CHECK_FULL MATCHES "ON")
- message(STATUS "Set debug level to full invariant checking (MI_CHECK_FULL=ON)")
+ message(STATUS "The MI_CHECK_FULL option is deprecated, use MI_DEBUG_FULL instead")
+ set(MI_DEBUG_FULL "ON")
+endif()
+
+if(MI_DEBUG_FULL MATCHES "ON")
+ message(STATUS "Set debug level to full internal invariant checking (MI_DEBUG_FULL=ON)")
list(APPEND mi_defines MI_DEBUG=3) # full invariant checking
endif()
@@ -108,19 +116,6 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
endif()
endif()
-if(NOT(CMAKE_BUILD_TYPE MATCHES "Release|release|RelWithDebInfo|relwithdebinfo"))
- string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type)
- set(mi_basename "mimalloc-${build_type}")
-else()
- if(MI_SECURE MATCHES "ON")
- set(mi_basename "mimalloc-secure")
- else()
- set(mi_basename "mimalloc")
- endif()
-endif()
-message(STATUS "Output library name : ${mi_basename}")
-message(STATUS "Installation directory: ${mi_install_dir}")
-
# extra needed libraries
if(WIN32)
list(APPEND mi_libraries psapi shell32 user32)
@@ -133,9 +128,28 @@ else()
endif()
# -----------------------------------------------------------------------------
-# Main targets
+# Install and output names
# -----------------------------------------------------------------------------
+set(mi_install_dir "${CMAKE_INSTALL_PREFIX}/lib/mimalloc-${mi_version}")
+if(MI_SECURE MATCHES "ON")
+ set(mi_basename "mimalloc-secure")
+else()
+ set(mi_basename "mimalloc")
+endif()
+string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LC)
+if(NOT(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel)$"))
+ set(mi_basename "${mi_basename}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. -debug) if not a release version
+endif()
+message(STATUS "")
+message(STATUS "Library base name: ${mi_basename}")
+message(STATUS "Build type : ${CMAKE_BUILD_TYPE_LC}")
+message(STATUS "Install directory: ${mi_install_dir}")
+message(STATUS "")
+
+# -----------------------------------------------------------------------------
+# Main targets
+# -----------------------------------------------------------------------------
# shared library
add_library(mimalloc SHARED ${mi_sources})
@@ -237,7 +251,7 @@ endif()
if (MI_OVERRIDE MATCHES "ON")
target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE)
if(NOT WIN32)
- # It is only possible to override malloc on Windows when building as a DLL. (src/alloc-override.c)
+ # It is only possible to override malloc on Windows when building as a DLL.
target_compile_definitions(mimalloc-static PRIVATE MI_MALLOC_OVERRIDE)
target_compile_definitions(mimalloc-obj PRIVATE MI_MALLOC_OVERRIDE)
endif()
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 79228c41..41d67f86 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -35,22 +35,32 @@ jobs:
CC: gcc
CXX: g++
BuildType: debug
- cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_CHECK_FULL=ON
+ cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
Release:
CC: gcc
CXX: g++
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
+ Secure:
+ CC: gcc
+ CXX: g++
+ BuildType: secure
+ cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
Debug Clang:
CC: clang
CXX: clang++
BuildType: debug-clang
- cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_CHECK_FULL=ON
+ cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
Release Clang:
CC: clang
CXX: clang++
BuildType: release-clang
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
+ Secure Clang:
+ CC: clang
+ CXX: clang++
+ BuildType: secure-clang
+ cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
steps:
- task: CMake@1
diff --git a/bin/mimalloc-redirect.dll b/bin/mimalloc-redirect.dll
index 45e0fb48..b7bf1d09 100644
Binary files a/bin/mimalloc-redirect.dll and b/bin/mimalloc-redirect.dll differ
diff --git a/bin/mimalloc-redirect.lib b/bin/mimalloc-redirect.lib
index 149dabb7..550db8ec 100644
Binary files a/bin/mimalloc-redirect.lib and b/bin/mimalloc-redirect.lib differ
diff --git a/bin/mimalloc-redirect32.dll b/bin/mimalloc-redirect32.dll
index 0b5f5156..7ba303af 100644
Binary files a/bin/mimalloc-redirect32.dll and b/bin/mimalloc-redirect32.dll differ
diff --git a/bin/mimalloc-redirect32.lib b/bin/mimalloc-redirect32.lib
index 2bc4b0bc..66173060 100644
Binary files a/bin/mimalloc-redirect32.lib and b/bin/mimalloc-redirect32.lib differ
diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake
index 03316948..9d78b5a0 100644
--- a/cmake/mimalloc-config-version.cmake
+++ b/cmake/mimalloc-config-version.cmake
@@ -1,5 +1,5 @@
set(mi_version_major 1)
-set(mi_version_minor 1)
+set(mi_version_minor 2)
set(mi_version ${mi_version_major}.${mi_version_minor})
set(PACKAGE_VERSION ${mi_version})
diff --git a/ide/vs2019/mimalloc-override.vcxproj.filters b/ide/vs2019/mimalloc-override.vcxproj.filters
new file mode 100644
index 00000000..bc1e4c60
--- /dev/null
+++ b/ide/vs2019/mimalloc-override.vcxproj.filters
@@ -0,0 +1,72 @@
+
+
+
+
+ Header Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+
+
+ {f1fccf27-17b9-42dd-ba51-6070baff85c6}
+
+
+ {39cb7e38-69d0-43fb-8406-6a0f7cefc3b4}
+
+
+
\ No newline at end of file
diff --git a/ide/vs2019/mimalloc-test-stress.vcxproj b/ide/vs2019/mimalloc-test-stress.vcxproj
index afbb6666..ef7ab357 100644
--- a/ide/vs2019/mimalloc-test-stress.vcxproj
+++ b/ide/vs2019/mimalloc-test-stress.vcxproj
@@ -149,8 +149,8 @@
-
- {abb5eae7-b3e6-432e-b636-333449892ea7}
+
+ {abb5eae7-b3e6-432e-b636-333449892ea6}
diff --git a/ide/vs2019/mimalloc.vcxproj b/ide/vs2019/mimalloc.vcxproj
index 5658b536..28e96d71 100644
--- a/ide/vs2019/mimalloc.vcxproj
+++ b/ide/vs2019/mimalloc.vcxproj
@@ -111,12 +111,12 @@
- Level3
+ Level2
Disabled
true
true
../../include
- MI_DEBUG=3;%(PreprocessorDefinitions);
+ MI_DEBUG=1;%(PreprocessorDefinitions);
CompileAsCpp
false
stdcpp17
diff --git a/ide/vs2019/mimalloc.vcxproj.filters b/ide/vs2019/mimalloc.vcxproj.filters
new file mode 100644
index 00000000..b2282df3
--- /dev/null
+++ b/ide/vs2019/mimalloc.vcxproj.filters
@@ -0,0 +1,75 @@
+
+
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+
+
+ {2b556b10-f559-4b2d-896e-142652adbf0c}
+
+
+ {852a14ae-6dde-4e95-8077-ca705e97e5af}
+
+
+
\ No newline at end of file
diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h
index 8b254d3e..10368df3 100644
--- a/include/mimalloc-atomic.h
+++ b/include/mimalloc-atomic.h
@@ -130,7 +130,7 @@ static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add
return (intptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
}
static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
- return (expected == RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected));
+ return (expected == (uintptr_t)RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected));
}
static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
return mi_atomic_cas_strong(p,desired,expected);
@@ -220,7 +220,7 @@ static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x)
#endif
#elif defined(__wasi__)
#include
- static inline void mi_atomic_yield() {
+ static inline void mi_atomic_yield(void) {
sched_yield();
}
#else
diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index e99e6df6..452f0b68 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -20,6 +20,18 @@ terms of the MIT license. A copy of the license can be found in the file
#define mi_trace_message(...)
#endif
+#if defined(_MSC_VER)
+#define mi_decl_noinline __declspec(noinline)
+#define mi_attr_noreturn
+#elif defined(__GNUC__) || defined(__clang__)
+#define mi_decl_noinline __attribute__((noinline))
+#define mi_attr_noreturn __attribute__((noreturn))
+#else
+#define mi_decl_noinline
+#define mi_attr_noreturn
+#endif
+
+
// "options.c"
void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message);
void _mi_fprintf(mi_output_fun* out, const char* fmt, ...);
@@ -28,12 +40,12 @@ void _mi_warning_message(const char* fmt, ...);
void _mi_verbose_message(const char* fmt, ...);
void _mi_trace_message(const char* fmt, ...);
void _mi_options_init(void);
+void _mi_fatal_error(const char* fmt, ...) mi_attr_noreturn;
// "init.c"
extern mi_stats_t _mi_stats_main;
extern const mi_page_t _mi_page_empty;
bool _mi_is_main_thread(void);
-uintptr_t _mi_ptr_cookie(const void* p);
uintptr_t _mi_random_shuffle(uintptr_t x);
uintptr_t _mi_random_init(uintptr_t seed /* can be zero */);
bool _mi_preloading(); // true while the C runtime is not ready
@@ -89,6 +101,7 @@ uint8_t _mi_bsr(uintptr_t x); // bit-scan-right, used on BSD i
void _mi_heap_destroy_pages(mi_heap_t* heap);
void _mi_heap_collect_abandon(mi_heap_t* heap);
uintptr_t _mi_heap_random(mi_heap_t* heap);
+void _mi_heap_set_default_direct(mi_heap_t* heap);
// "stats.c"
void _mi_stats_done(mi_stats_t* stats);
@@ -124,13 +137,6 @@ bool _mi_page_is_valid(mi_page_t* page);
#define __has_builtin(x) 0
#endif
-#if defined(_MSC_VER)
-#define mi_decl_noinline __declspec(noinline)
-#elif defined(__GNUC__) || defined(__clang__)
-#define mi_decl_noinline __attribute__((noinline))
-#else
-#define mi_decl_noinline
-#endif
/* -----------------------------------------------------------
@@ -156,10 +162,13 @@ bool _mi_page_is_valid(mi_page_t* page);
#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX)
static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
#if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
-#if (MI_INTPTR_SIZE == 4)
+#include // UINT_MAX, ULONG_MAX
+#if (SIZE_MAX == UINT_MAX)
return __builtin_umul_overflow(count, size, total);
-#else
+#elif (SIZE_MAX == ULONG_MAX)
return __builtin_umull_overflow(count, size, total);
+#else
+ return __builtin_umulll_overflow(count, size, total);
#endif
#else /* __builtin_umul_overflow is unavailable */
*total = count * size;
@@ -235,6 +244,10 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
return (heap != &_mi_heap_empty);
}
+static inline uintptr_t _mi_ptr_cookie(const void* p) {
+ return ((uintptr_t)p ^ _mi_heap_main.cookie);
+}
+
/* -----------------------------------------------------------
Pages
----------------------------------------------------------- */
@@ -262,14 +275,20 @@ static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
return segment;
}
-// Get the page containing the pointer
-static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
+// used internally
+static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) {
// if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages
ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
mi_assert_internal(diff >= 0 && diff < MI_SEGMENT_SIZE);
uintptr_t idx = (uintptr_t)diff >> segment->page_shift;
mi_assert_internal(idx < segment->capacity);
mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0);
+ return idx;
+}
+
+// Get the page containing the pointer
+static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
+ uintptr_t idx = _mi_segment_page_idx_of(segment, p);
return &((mi_segment_t*)segment)->pages[idx];
}
@@ -342,59 +361,85 @@ static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size)
// Page flags
//-----------------------------------------------------------
static inline bool mi_page_is_in_full(const mi_page_t* page) {
- return page->flags.in_full;
+ return page->flags.x.in_full;
}
static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
- page->flags.in_full = in_full;
+ page->flags.x.in_full = in_full;
}
static inline bool mi_page_has_aligned(const mi_page_t* page) {
- return page->flags.has_aligned;
+ return page->flags.x.has_aligned;
}
static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
- page->flags.has_aligned = has_aligned;
+ page->flags.x.has_aligned = has_aligned;
}
// -------------------------------------------------------------------
// Encoding/Decoding the free list next pointers
+// Note: we pass a `null` value to be used as the `NULL` value for the
+// end of a free list. This is to prevent the cookie itself from ever
+// being present among user blocks (as `cookie^0==cookie`).
// -------------------------------------------------------------------
-static inline mi_block_t* mi_block_nextx( uintptr_t cookie, mi_block_t* block ) {
- #if MI_SECURE
- return (mi_block_t*)(block->next ^ cookie);
+static inline bool mi_is_in_same_segment(const void* p, const void* q) {
+ return (_mi_ptr_segment(p) == _mi_ptr_segment(q));
+}
+
+static inline bool mi_is_in_same_page(const void* p, const void* q) {
+ mi_segment_t* segmentp = _mi_ptr_segment(p);
+ mi_segment_t* segmentq = _mi_ptr_segment(q);
+ if (segmentp != segmentq) return false;
+ uintptr_t idxp = _mi_segment_page_idx_of(segmentp, p);
+ uintptr_t idxq = _mi_segment_page_idx_of(segmentq, q);
+ return (idxp == idxq);
+}
+
+static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, uintptr_t cookie ) {
+ #ifdef MI_ENCODE_FREELIST
+ mi_block_t* b = (mi_block_t*)(block->next ^ cookie);
+ if (mi_unlikely((void*)b==null)) { b = NULL; }
+ return b;
#else
- UNUSED(cookie);
+ UNUSED(cookie); UNUSED(null);
return (mi_block_t*)block->next;
#endif
}
-static inline void mi_block_set_nextx(uintptr_t cookie, mi_block_t* block, mi_block_t* next) {
- #if MI_SECURE
+static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, uintptr_t cookie) {
+ #ifdef MI_ENCODE_FREELIST
+ if (mi_unlikely(next==NULL)) { next = (mi_block_t*)null; }
block->next = (mi_encoded_t)next ^ cookie;
#else
- UNUSED(cookie);
+ UNUSED(cookie); UNUSED(null);
block->next = (mi_encoded_t)next;
#endif
}
-static inline mi_block_t* mi_block_next(mi_page_t* page, mi_block_t* block) {
- #if MI_SECURE
- return mi_block_nextx(page->cookie,block);
+static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
+ #ifdef MI_ENCODE_FREELIST
+ mi_block_t* next = mi_block_nextx(page,block,page->cookie);
+ // check for free list corruption: is `next` at least in our segment range?
+ // TODO: check if `next` is `page->block_size` aligned?
+ if (next!=NULL && !mi_is_in_same_page(block, next)) {
+ _mi_fatal_error("corrupted free list entry of size %zub at %p: value 0x%zx\n", page->block_size, block, (uintptr_t)next);
+ next = NULL;
+ }
+ return next;
#else
UNUSED(page);
- return mi_block_nextx(0, block);
+ return mi_block_nextx(page,block,0);
#endif
}
-static inline void mi_block_set_next(mi_page_t* page, mi_block_t* block, mi_block_t* next) {
- #if MI_SECURE
- mi_block_set_nextx(page->cookie,block,next);
+static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
+ #ifdef MI_ENCODE_FREELIST
+ mi_block_set_nextx(page,block,next, page->cookie);
#else
UNUSED(page);
- mi_block_set_nextx(0, block, next);
+ mi_block_set_nextx(page,block, next,0);
#endif
}
diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index 5fbb9ea5..055d1204 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -22,8 +22,11 @@ terms of the MIT license. A copy of the license can be found in the file
// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
// #define MI_STAT 1
-// Define MI_SECURE as 1 to encode free lists
-// #define MI_SECURE 1
+// Define MI_SECURE to enable security mitigations
+// #define MI_SECURE 1 // guard page around metadata
+// #define MI_SECURE 2 // guard page around each mimalloc page
+// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
+// #define MI_SECURE 4 // checks for double free. (may be more expensive)
#if !defined(MI_SECURE)
#define MI_SECURE 0
@@ -33,17 +36,23 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_USER_CLEANUP 0
#endif
-// Define MI_DEBUG as 1 for basic assert checks and statistics
-// set it to 2 to do internal asserts,
-// and to 3 to do extensive invariant checking.
+// Define MI_DEBUG for debug mode
+// #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free.
+// #define MI_DEBUG 2 // + internal assertion checks
+// #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
#if !defined(MI_DEBUG)
#if !defined(NDEBUG) || defined(_DEBUG)
-#define MI_DEBUG 1
+#define MI_DEBUG 2
#else
#define MI_DEBUG 0
#endif
#endif
+// Encoded free lists allow detection of corrupted free lists
+// and can detect buffer overflows and double `free`s.
+#if (MI_SECURE>=3 || MI_DEBUG>=1)
+#define MI_ENCODE_FREELIST 1
+#endif
// ------------------------------------------------------
// Platform specific values
@@ -118,6 +127,8 @@ terms of the MIT license. A copy of the license can be found in the file
#error "define more bins"
#endif
+// The free lists use encoded next fields
+// (Only actually encodes when MI_ENCODE_FREELIST is defined.)
typedef uintptr_t mi_encoded_t;
// free lists contain blocks
@@ -126,6 +137,7 @@ typedef struct mi_block_s {
} mi_block_t;
+// The delayed flags are used for efficient multi-threaded free-ing
typedef enum mi_delayed_e {
MI_NO_DELAYED_FREE = 0,
MI_USE_DELAYED_FREE = 1,
@@ -135,15 +147,13 @@ typedef enum mi_delayed_e {
// The `in_full` and `has_aligned` page flags are put in a union to efficiently
-// test if both are false (`value == 0`) in the `mi_free` routine.
-typedef union mi_page_flags_u {
- uint16_t value;
- uint8_t full_aligned;
+// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
+typedef union mi_page_flags_s {
+ uint8_t full_aligned;
struct {
- bool in_full:1;
- bool has_aligned:1;
- bool is_zero; // `true` if the blocks in the free list are zero initialized
- };
+ uint8_t in_full : 1;
+ uint8_t has_aligned : 1;
+ } x;
} mi_page_flags_t;
// Thread free list.
@@ -171,18 +181,19 @@ typedef uintptr_t mi_thread_free_t;
typedef struct mi_page_s {
// "owned" by the segment
uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]`
- bool segment_in_use:1; // `true` if the segment allocated this page
- bool is_reset:1; // `true` if the page memory was reset
- bool is_committed:1; // `true` if the page virtual memory is committed
- bool is_zero_init:1; // `true` if the page was zero initialized
+ uint8_t segment_in_use:1; // `true` if the segment allocated this page
+ uint8_t is_reset:1; // `true` if the page memory was reset
+ uint8_t is_committed:1; // `true` if the page virtual memory is committed
+ uint8_t is_zero_init:1; // `true` if the page was zero initialized
// layout like this to optimize access in `mi_malloc` and `mi_free`
uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear`
uint16_t reserved; // number of blocks reserved in memory
- mi_page_flags_t flags; // `in_full` and `has_aligned` flags (16 bits)
+ mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits)
+ bool is_zero; // `true` if the blocks in the free list are zero initialized
mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
- #if MI_SECURE
+ #ifdef MI_ENCODE_FREELIST
uintptr_t cookie; // random cookie to encode the free lists
#endif
size_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`)
@@ -199,8 +210,8 @@ typedef struct mi_page_s {
// improve page index calculation
// without padding: 10 words on 64-bit, 11 on 32-bit. Secure adds one word
- #if (MI_INTPTR_SIZE==8 && MI_SECURE>0) || (MI_INTPTR_SIZE==4 && MI_SECURE==0)
- void* padding[1]; // 12 words on 64-bit in secure mode, 12 words on 32-bit plain
+ #if (MI_INTPTR_SIZE==8 && defined(MI_ENCODE_FREELIST)) || (MI_INTPTR_SIZE==4 && !defined(MI_ENCODE_FREELIST))
+ void* padding[1]; // 12 words on 64-bit with cookie, 12 words on 32-bit plain
#endif
} mi_page_t;
@@ -342,14 +353,14 @@ typedef struct mi_stats_s {
mi_stat_count_t page_committed;
mi_stat_count_t segments_abandoned;
mi_stat_count_t pages_abandoned;
- mi_stat_count_t pages_extended;
- mi_stat_count_t mmap_calls;
- mi_stat_count_t commit_calls;
mi_stat_count_t threads;
mi_stat_count_t huge;
mi_stat_count_t giant;
mi_stat_count_t malloc;
mi_stat_count_t segments_cache;
+ mi_stat_counter_t pages_extended;
+ mi_stat_counter_t mmap_calls;
+ mi_stat_counter_t commit_calls;
mi_stat_counter_t page_no_retire;
mi_stat_counter_t searches;
mi_stat_counter_t huge_count;
diff --git a/include/mimalloc.h b/include/mimalloc.h
index 194edb47..540606b2 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
-#define MI_MALLOC_VERSION 110 // major + 2 digits minor
+#define MI_MALLOC_VERSION 120 // major + 2 digits minor
// ------------------------------------------------------
// Compiler specific attributes
@@ -274,6 +274,7 @@ typedef enum mi_option_e {
mi_option_eager_commit_delay,
mi_option_segment_reset,
mi_option_os_tag,
+ mi_option_max_errors,
_mi_option_last
} mi_option_t;
diff --git a/readme.md b/readme.md
index c803646c..3ccdbf17 100644
--- a/readme.md
+++ b/readme.md
@@ -1,7 +1,7 @@
-[
](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
+[
](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
# mimalloc
@@ -37,7 +37,7 @@ Notable aspects of the design include:
programs.
- __secure__: _mimalloc_ can be built in secure mode, adding guard pages,
randomized allocation, encrypted free lists, etc. to protect against various
- heap vulnerabilities. The performance penalty is only around 3% on average
+ heap vulnerabilities. The performance penalty is usually around 10% on average
over our benchmarks.
- __user function for clean up memory__: _mimalloc_ can be built with MI_USER_CLEANUP=ON flag. This mode
allows setup user function for memory clean up before it returned to system.
@@ -58,6 +58,7 @@ Enjoy!
### Releases
+* 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows.
* 2019-10-07, `v1.1.0`: stable release 1.1.
* 2019-09-01, `v1.0.8`: pre-release 8: more robust windows dynamic overriding, initial huge page support.
* 2019-08-10, `v1.0.6`: pre-release 6: various performance improvements.
@@ -66,7 +67,7 @@ Enjoy!
## Windows
-Open `ide/vs2017/mimalloc.sln` in Visual Studio 2017 and build.
+Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build (or `ide/vs2017/mimalloc.sln`).
The `mimalloc` project builds a static library (in `out/msvc-x64`), while the
`mimalloc-override` project builds a DLL for overriding malloc
in the entire program.
@@ -99,7 +100,7 @@ maintains detailed statistics as:
This will name the shared library as `libmimalloc-debug.so`.
Finally, you can build a _secure_ version that uses guard pages, encrypted
-free lists, etc, as:
+free lists, etc., as:
```
> mkdir -p out/secure
> cd out/secure
@@ -140,6 +141,9 @@ target_link_libraries(myapp PUBLIC mimalloc-static)
```
to link with the static library. See `test\CMakeLists.txt` for an example.
+For best performance in C++ programs, it is also recommended to override the
+global `new` and `delete` operators. For convenience, mimalloc provides
+[mimalloc-new-delete.h](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project.
You can pass environment variables to print verbose messages (`MIMALLOC_VERBOSE=1`)
and statistics (`MIMALLOC_SHOW_STATS=1`) (in the debug version):
@@ -190,18 +194,18 @@ or via environment variables.
- `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates.
- `MIMALLOC_VERBOSE=1`: show verbose messages.
- `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages.
-- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages when available; for some workloads this can significantly
+- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages when available; for some workloads this can significantly
improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs
to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes
the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that
can have fragmented memory.
- `MIMALLOC_EAGER_REGION_COMMIT=1`: on Windows, commit large (256MiB) regions eagerly. On Windows, these regions
- show in the working set even though usually just a small part is committed to physical memory. This is why it
- turned off by default on Windows as it looks not good in the task manager. However, in reality it is always better
+ show in the working set even though usually just a small part is committed to physical memory. This is why it
+ turned off by default on Windows as it looks not good in the task manager. However, in reality it is always better
to turn it on as it improves performance and has no other drawbacks.
- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB huge OS pages. This reserves the huge pages at
- startup and can give quite a performance improvement on long running workloads. Usually it is better to not use
- `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
+ startup and can give quite a performance improvement on long running workloads. Usually it is better to not use
+ `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
contiguous physical memory can take a long time when memory is fragmented. Still experimental.
[linux-huge]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/5/html/tuning_and_optimizing_red_hat_enterprise_linux_for_oracle_9i_and_10g_databases/sect-oracle_9i_and_10g_tuning_guide-large_memory_optimization_big_pages_and_huge_pages-configuring_huge_pages_in_red_hat_enterprise_linux_4_or_5
@@ -213,7 +217,7 @@ Overriding the standard `malloc` can be done either _dynamically_ or _statically
## Dynamic override
-This is the recommended way to override the standard malloc interface.
+This is the recommended way to override the standard malloc interface.
### Linux, BSD
@@ -246,29 +250,31 @@ resolved to the _mimalloc_ library.
Note that certain security restrictions may apply when doing this from
the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash).
-Note: unfortunately, at this time, dynamic overriding on macOS seems broken but it is actively worked on to fix this
+Note: unfortunately, at this time, dynamic overriding on macOS seems broken but it is actively worked on to fix this
(see issue [`#50`](https://github.com/microsoft/mimalloc/issues/50)).
### Windows
On Windows you need to link your program explicitly with the mimalloc
-DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
-Moreover, you need to ensure the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) is available
-in the same folder as the mimalloc DLL at runtime (as it as referred to by the mimalloc DLL).
-The redirection DLL's ensure all calls to the C runtime malloc API get redirected to mimalloc.
+DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
+Moreover, you need to ensure the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) is available
+in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency).
+The redirection DLL ensures that all calls to the C runtime malloc API get redirected to
+mimalloc (in `mimalloc-override.dll`).
To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some
-call to the mimalloc API in the `main` function, like `mi_version()`
+call to the mimalloc API in the `main` function, like `mi_version()`
(or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project
-for an example on how to use this.
+for an example on how to use this. For best performance on Windows with C++, it
+is highly recommended to also override the `new`/`delete` operations (as described
+in the introduction).
The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic
-overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc successfully redirected.
+overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected.
-(Note: in principle, it should be possible to patch existing executables
-that are linked with the dynamic C runtime (`ucrtbase.dll`) by just putting the mimalloc DLL into
-the import table (and putting `mimalloc-redirect.dll` in the same folder)
-Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)).
+(Note: in principle, it is possible to patch existing executables
+that are linked with the dynamic C runtime (`ucrtbase.dll`) by just putting the `mimalloc-override.dll` into the import table (and putting `mimalloc-redirect.dll` in the same folder).
+Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)).
## Static override
@@ -284,6 +290,12 @@ object file. For example:
> gcc -o myprogram mimalloc-override.o myfile1.c ...
```
+Another way to override statically, which works on all platforms, is to
+link statically to mimalloc (as shown in the introduction) and include a
+header file in each source file that re-defines `malloc` etc. to `mi_malloc`.
+This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h). This only works reliably if all sources are
+under your control; otherwise, mixing of pointers from different heaps may occur!
+
# Performance
diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c
index 352f07b2..5a59a63a 100644
--- a/src/alloc-aligned.c
+++ b/src/alloc-aligned.c
@@ -61,53 +61,53 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
}
-void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false);
}
-void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_malloc_aligned_at(heap, size, alignment, 0);
}
-void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true);
}
-void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_zalloc_aligned_at(heap, size, alignment, 0);
}
-void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count, size, &total)) return NULL;
return mi_heap_zalloc_aligned_at(heap, total, alignment, offset);
}
-void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_calloc_aligned_at(heap,count,size,alignment,0);
}
-void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
}
-void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment);
}
-void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
}
-void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment);
}
-void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset);
}
-void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment);
}
@@ -126,7 +126,7 @@ static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t ne
if (newp != NULL) {
if (zero && newsize > size) {
const mi_page_t* page = _mi_ptr_page(newp);
- if (page->flags.is_zero) {
+ if (page->is_zero) {
// already zero initialized
mi_assert_expensive(mi_mem_is_zero(newp,newsize));
}
@@ -150,55 +150,55 @@ static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsi
return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero);
}
-void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false);
}
-void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false);
}
-void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true);
}
-void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true);
}
-void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(newcount, size, &total)) return NULL;
return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset);
}
-void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(newcount, size, &total)) return NULL;
return mi_heap_rezalloc_aligned(heap, p, total, alignment);
}
-void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
}
-void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment);
}
-void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
}
-void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment);
}
-void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+mi_decl_allocator void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset);
}
-void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+mi_decl_allocator void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment);
}
diff --git a/src/alloc-override-win.c b/src/alloc-override-win.c
deleted file mode 100644
index dc4796ab..00000000
--- a/src/alloc-override-win.c
+++ /dev/null
@@ -1,715 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2018, Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-
-#include "mimalloc.h"
-#include "mimalloc-internal.h"
-
-#if !defined(_WIN32)
-#error "this file should only be included on Windows"
-#endif
-
-#include
-#include
-
-#include // getenv
-#include // _setmaxstdio
-#include // strstr
-
-
-/*
-To override the C runtime `malloc` on Windows we need to patch the allocation
-functions at runtime initialization. Unfortunately we can never patch before the
-runtime initializes itself, because as soon as we call `GetProcAddress` on the
-runtime module (a DLL or EXE in Windows speak), it will first load and initialize
-(by the OS calling `DllMain` on it).
-
-This means that some things might be already allocated by the C runtime itself
-(and possibly other DLL's) before we get to resolve runtime adresses. This is
-no problem if everyone unwinds in order: when we unload, we unpatch and restore
-the original crt `free` routines and crt malloc'd memory is freed correctly.
-
-But things go wrong if such early CRT alloc'd memory is freed or re-allocated
-_after_ we patch, but _before_ we unload (and unpatch), or if any memory allocated
-by us is freed after we unpatched.
-
-There are two tricky situations to deal with:
-
-1. The Thread Local Storage (TLS): when the main thread stops it will call registered
- callbacks on TLS entries (allocated by `FlsAlloc`). This is done by the OS
- before any DLL's are unloaded. Unfortunately, the C runtime registers such
- TLS entries with CRT allocated memory which is freed in the callback.
-
-2. Inside the CRT:
- a. Some variables might get initialized by patched allocated
- blocks but freed during CRT unloading after we unpatched
- (like temporary file buffers).
- b. Some blocks are allocated at CRT and freed by the CRT (like the
- environment storage).
- c. And some blocks are allocated by the CRT and then reallocated
- while patched, and finally freed after unpatching! This
- happens with the `atexit` functions for example to grow the array
- of registered functions.
-
-In principle situation 2 is hopeless: since we cannot patch before CRT initialization,
-we can never be sure how to free or reallocate a pointer during CRT unloading.
-However, in practice there is a good solution: when terminating, we just patch
-the reallocation and free routines to no-ops -- we are winding down anyway! This leaves
-just the reallocation problm of CRT alloc'd memory once we are patched. Here, a study of the
-CRT reveals that there seem to be just three such situations:
-
-1. When registering `atexit` routines (to grow the exit function table),
-2. When calling `_setmaxstdio` (to grow the file handle table),
-3. and `_popen`/`_wpopen` (to grow handle pairs). These turn out not to be
- a problem as these are NULL initialized.
-
-We fix these by providing wrappers:
-
-1. We first register a _global_ `atexit` routine ourselves (`mi_patches_at_exit`) before patching,
- and then patch the `_crt_atexit` function to implement our own global exit list (and the
- same for `_crt_at_quick_exit`). All module local lists are no problem since they are always fully
- (un)patched from initialization to end. We can register in the global list by dynamically
- getting the global `_crt_atexit` entry from `ucrtbase.dll`.
-
-2. The `_setmaxstdio` is _detoured_: we patch it by a stub that unpatches first,
- calls the original routine and repatches again.
-
-That leaves us to reliably shutdown and enter "termination mode":
-
-1. Using our trick to get the global exit list entry point, we register an exit function `mi_patches_atexit`
- that first executes all our home brew list of exit functions, and then enters a _termination_
- phase that patches realloc/free variants with no-ops. Patching later again with special no-ops for
- `free` also improves efficiency during the program run since no flags need to be checked.
-
-2. That is not quite good enough yet since after executing exit routines after us on the
- global exit list (registered by the CRT),
- the OS starts to unwind the TLS callbacks and we would like to run callbacks registered after loading
- our DLL to be done in patched mode. So, we also allocate a TLS entry when our DLL is loaded and when its
- callback is called, we re-enable the original patches again. Since TLS is destroyed in FIFO order
- this runs any callbacks in later DLL's in patched mode.
-
-3. Finally the DLL's get unloaded by the OS in order (still patched) until our DLL gets unloaded
- and then we start a termination phase again, and patch realloc/free with no-ops for good this time.
-
-*/
-
-static int __cdecl mi_setmaxstdio(int newmax);
-
-// ------------------------------------------------------
-// Microsoft allocation extensions
-// ------------------------------------------------------
-
-
-typedef size_t mi_nothrow_t;
-
-static void mi_free_nothrow(void* p, mi_nothrow_t tag) {
- UNUSED(tag);
- mi_free(p);
-}
-
-// Versions of `free`, `realloc`, `recalloc`, `expand` and `msize`
-// that are used during termination and are no-ops.
-static void mi_free_term(void* p) {
- UNUSED(p);
-}
-
-static void mi_free_size_term(void* p, size_t size) {
- UNUSED(size);
- UNUSED(p);
-}
-
-static void mi_free_nothrow_term(void* p, mi_nothrow_t tag) {
- UNUSED(tag);
- UNUSED(p);
-}
-
-static void* mi_realloc_term(void* p, size_t newsize) {
- UNUSED(p); UNUSED(newsize);
- return NULL;
-}
-
-static void* mi__recalloc_term(void* p, size_t newcount, size_t newsize) {
- UNUSED(p); UNUSED(newcount); UNUSED(newsize);
- return NULL;
-}
-
-static void* mi__expand_term(void* p, size_t newsize) {
- UNUSED(p); UNUSED(newsize);
- return NULL;
-}
-
-static size_t mi__msize_term(void* p) {
- UNUSED(p);
- return 0;
-}
-
-
-static void* mi__malloc_dbg(size_t size, int block_type, const char* fname, int line) {
- UNUSED(block_type); UNUSED(fname); UNUSED(line);
- return _malloc_base(size);
-}
-
-static void* mi__calloc_dbg(size_t count, size_t size, int block_type, const char* fname, int line) {
- UNUSED(block_type); UNUSED(fname); UNUSED(line);
- return _calloc_base(count, size);
-}
-
-static void* mi__realloc_dbg(void* p, size_t size, int block_type, const char* fname, int line) {
- UNUSED(block_type); UNUSED(fname); UNUSED(line);
- return _realloc_base(p, size);
-}
-
-static void mi__free_dbg(void* p, int block_type) {
- UNUSED(block_type);
- _free_base(p);
-}
-
-
-// the `recalloc`,`expand`, and `msize` don't have base versions and thus need a separate term version
-
-static void* mi__recalloc_dbg(void* p, size_t count, size_t size, int block_type, const char* fname, int line) {
- UNUSED(block_type); UNUSED(fname); UNUSED(line);
- return mi_recalloc(p, count, size);
-}
-
-static void* mi__expand_dbg(void* p, size_t size, int block_type, const char* fname, int line) {
- UNUSED(block_type); UNUSED(fname); UNUSED(line);
- return mi__expand(p, size);
-}
-
-static size_t mi__msize_dbg(void* p, int block_type) {
- UNUSED(block_type);
- return mi_usable_size(p);
-}
-
-static void* mi__recalloc_dbg_term(void* p, size_t count, size_t size, int block_type, const char* fname, int line) {
- UNUSED(block_type); UNUSED(fname); UNUSED(line);
- return mi__recalloc_term(p, count, size);
-}
-
-static void* mi__expand_dbg_term(void* p, size_t size, int block_type, const char* fname, int line) {
- UNUSED(block_type); UNUSED(fname); UNUSED(line);
- return mi__expand_term(p, size);
-}
-
-static size_t mi__msize_dbg_term(void* p, int block_type) {
- UNUSED(block_type);
- return mi__msize_term(p);
-}
-
-
-// ------------------------------------------------------
-// implement our own global atexit handler
-// ------------------------------------------------------
-typedef void (cbfun_t)(void);
-typedef int (atexit_fun_t)(cbfun_t* fn);
-typedef uintptr_t encoded_t;
-
-typedef struct exit_list_s {
- encoded_t functions; // encoded pointer to array of encoded function pointers
- size_t count;
- size_t capacity;
-} exit_list_t;
-
-#define MI_EXIT_INC (64)
-
-static exit_list_t atexit_list = { 0, 0, 0 };
-static exit_list_t at_quick_exit_list = { 0, 0, 0 };
-static CRITICAL_SECTION atexit_lock;
-
-// encode/decode function pointers with a random canary for security
-static encoded_t canary;
-
-static inline void *decode(encoded_t x) {
- return (void*)(x^canary);
-}
-
-static inline encoded_t encode(void* p) {
- return ((uintptr_t)p ^ canary);
-}
-
-
-static void init_canary()
-{
- canary = _mi_random_init(0);
- atexit_list.functions = at_quick_exit_list.functions = encode(NULL);
-}
-
-
-// initialize the list
-static void mi_initialize_atexit(void) {
- InitializeCriticalSection(&atexit_lock);
- init_canary();
-}
-
-// register an exit function
-static int mi_register_atexit(exit_list_t* list, cbfun_t* fn) {
- if (fn == NULL) return EINVAL;
- EnterCriticalSection(&atexit_lock);
- encoded_t* functions = (encoded_t*)decode(list->functions);
- if (list->count >= list->capacity) { // at first `functions == decode(0) == NULL`
- encoded_t* newf = (encoded_t*)mi_recalloc(functions, list->capacity + MI_EXIT_INC, sizeof(cbfun_t*));
- if (newf != NULL) {
- list->capacity += MI_EXIT_INC;
- list->functions = encode(newf);
- functions = newf;
- }
- }
- int result;
- if (list->count < list->capacity && functions != NULL) {
- functions[list->count] = encode(fn);
- list->count++;
- result = 0; // success
- }
- else {
- result = ENOMEM;
- }
- LeaveCriticalSection(&atexit_lock);
- return result;
-}
-
-// Register a global `atexit` function
-static int mi_atexit(cbfun_t* fn) {
- return mi_register_atexit(&atexit_list,fn);
-}
-
-static int mi_at_quick_exit(cbfun_t* fn) {
- return mi_register_atexit(&at_quick_exit_list,fn);
-}
-
-static int mi_register_onexit(void* table, cbfun_t* fn) {
- // TODO: how can we distinguish a quick_exit from atexit?
- return mi_atexit(fn);
-}
-
-// Execute exit functions in a list
-static void mi_execute_exit_list(exit_list_t* list) {
- // copy and zero the list structure
- EnterCriticalSection(&atexit_lock);
- exit_list_t clist = *list;
- memset(list,0,sizeof(*list));
- LeaveCriticalSection(&atexit_lock);
-
- // now execute the functions outside of the lock
- encoded_t* functions = (encoded_t*)decode(clist.functions);
- if (functions != NULL) {
- for (size_t i = clist.count; i > 0; i--) { // careful with unsigned count down..
- cbfun_t* fn = (cbfun_t*)decode(functions[i-1]);
- if (fn==NULL) break; // corrupted!
- fn();
- }
- mi_free(functions);
- }
-}
-
-
-
-// ------------------------------------------------------
-// Jump assembly instructions for patches
-// ------------------------------------------------------
-
-#if defined(_M_IX86) || defined(_M_X64)
-
-#define MI_JUMP_SIZE 14 // at most 2+4+8 for a long jump or 1+5 for a short one
-
-typedef struct mi_jump_s {
- uint8_t opcodes[MI_JUMP_SIZE];
-} mi_jump_t;
-
-void mi_jump_restore(void* current, const mi_jump_t* saved) {
- memcpy(current, &saved->opcodes, MI_JUMP_SIZE);
-}
-
-void mi_jump_write(void* current, void* target, mi_jump_t* save) {
- if (save != NULL) {
- memcpy(&save->opcodes, current, MI_JUMP_SIZE);
- }
- uint8_t* opcodes = ((mi_jump_t*)current)->opcodes;
- ptrdiff_t diff = (uint8_t*)target - (uint8_t*)current;
- uint32_t ofs32 = (uint32_t)diff;
- #ifdef _M_X64
- uint64_t ofs64 = (uint64_t)diff;
- if (ofs64 != (uint64_t)ofs32) {
- // use long jump
- opcodes[0] = 0xFF;
- opcodes[1] = 0x25;
- *((uint32_t*)&opcodes[2]) = 0;
- *((uint64_t*)&opcodes[6]) = (uint64_t)target;
- }
- else
- #endif
- {
- // use short jump
- opcodes[0] = 0xE9;
- *((uint32_t*)&opcodes[1]) = ofs32 - 5 /* size of the short jump instruction */;
- }
-}
-
-#elif defined(_M_ARM64)
-
-#define MI_JUMP_SIZE 16
-
-typedef struct mi_jump_s {
- uint8_t opcodes[MI_JUMP_SIZE];
-} mi_jump_t;
-
-void mi_jump_restore(void* current, const mi_jump_t* saved) {
- memcpy(current, &saved->opcodes, MI_JUMP_SIZE);
-}
-
-void mi_jump_write(void* current, void* target, mi_jump_t* save) {
- if (save != NULL) {
- memcpy(&save->opcodes, current, MI_JUMP_SIZE);
- }
- uint8_t* opcodes = ((mi_jump_t*)current)->opcodes;
- uint64_t diff = (uint8_t*)target - (uint8_t*)current;
-
- // 0x50 0x00 0x00 0x58 ldr x16, .+8 # load PC relative +8
- // 0x00 0x02 0x3F 0xD6 blr x16 # and jump
- //
- //
- static const uint8_t jump_opcodes[8] = { 0x50, 0x00, 0x00, 0x58, 0x00, 0x02, 0x3F, 0xD6 };
- memcpy(&opcodes[0], jump_opcodes, sizeof(jump_opcodes));
- *((uint64_t*)&opcodes[8]) = diff;
-}
-
-#else
-#error "define jump instructions for this platform"
-#endif
-
-
-// ------------------------------------------------------
-// Patches
-// ------------------------------------------------------
-typedef enum patch_apply_e {
- PATCH_NONE,
- PATCH_TARGET,
- PATCH_TARGET_TERM
-} patch_apply_t;
-
-#define MAX_ENTRIES 4 // maximum number of patched entry points (like `malloc` in ucrtbase and msvcrt)
-
-typedef struct mi_patch_s {
- const char* name; // name of the function to patch
- void* target; // the address of the new target (never NULL)
- void* target_term; // the address of the target during termination (or NULL)
- patch_apply_t applied; // what target has been applied?
- void* originals[MAX_ENTRIES]; // the resolved addresses of the function (or NULLs)
- mi_jump_t saves[MAX_ENTRIES]; // the saved instructions in case it was applied
-} mi_patch_t;
-
-#define MI_PATCH_NAME3(name,target,term) { name, &target, &term, PATCH_NONE, {NULL,NULL,NULL,NULL} }
-#define MI_PATCH_NAME2(name,target) { name, &target, NULL, PATCH_NONE, {NULL,NULL,NULL,NULL} }
-#define MI_PATCH3(name,target,term) MI_PATCH_NAME3(#name, target, term)
-#define MI_PATCH2(name,target) MI_PATCH_NAME2(#name, target)
-#define MI_PATCH1(name) MI_PATCH2(name,mi_##name)
-
-static mi_patch_t patches[] = {
- // we implement our own global exit handler (as the CRT versions do a realloc internally)
- //MI_PATCH2(_crt_atexit, mi_atexit),
- //MI_PATCH2(_crt_at_quick_exit, mi_at_quick_exit),
- MI_PATCH2(_setmaxstdio, mi_setmaxstdio),
- MI_PATCH2(_register_onexit_function, mi_register_onexit),
-
- // override higher level atexit functions so we can implement at_quick_exit correcty
- MI_PATCH2(atexit, mi_atexit),
- MI_PATCH2(at_quick_exit, mi_at_quick_exit),
-
- // regular entries
- MI_PATCH2(malloc, mi_malloc),
- MI_PATCH2(calloc, mi_calloc),
- MI_PATCH3(realloc, mi_realloc,mi_realloc_term),
- MI_PATCH3(free, mi_free,mi_free_term),
-
- // extended api
- MI_PATCH2(_strdup, mi_strdup),
- MI_PATCH2(_strndup, mi_strndup),
- MI_PATCH3(_expand, mi__expand,mi__expand_term),
- MI_PATCH3(_recalloc, mi_recalloc,mi__recalloc_term),
- MI_PATCH3(_msize, mi_usable_size,mi__msize_term),
-
- // base versions
- MI_PATCH2(_malloc_base, mi_malloc),
- MI_PATCH2(_calloc_base, mi_calloc),
- MI_PATCH3(_realloc_base, mi_realloc,mi_realloc_term),
- MI_PATCH3(_free_base, mi_free,mi_free_term),
-
- // these base versions are in the crt but without import records
- MI_PATCH_NAME3("_recalloc_base", mi_recalloc,mi__recalloc_term),
- MI_PATCH_NAME3("_msize_base", mi_usable_size,mi__msize_term),
-
- // debug
- MI_PATCH2(_malloc_dbg, mi__malloc_dbg),
- MI_PATCH2(_realloc_dbg, mi__realloc_dbg),
- MI_PATCH2(_calloc_dbg, mi__calloc_dbg),
- MI_PATCH2(_free_dbg, mi__free_dbg),
-
- MI_PATCH3(_expand_dbg, mi__expand_dbg, mi__expand_dbg_term),
- MI_PATCH3(_recalloc_dbg, mi__recalloc_dbg, mi__recalloc_dbg_term),
- MI_PATCH3(_msize_dbg, mi__msize_dbg, mi__msize_dbg_term),
-
-#if 0
- // override new/delete variants for efficiency (?)
-#ifdef _WIN64
- // 64 bit new/delete
- MI_PATCH_NAME2("??2@YAPEAX_K@Z", mi_new),
- MI_PATCH_NAME2("??_U@YAPEAX_K@Z", mi_new),
- MI_PATCH_NAME3("??3@YAXPEAX@Z", mi_free, mi_free_term),
- MI_PATCH_NAME3("??_V@YAXPEAX@Z", mi_free, mi_free_term),
- MI_PATCH_NAME3("??3@YAXPEAX_K@Z", mi_free_size, mi_free_size_term), // delete sized
- MI_PATCH_NAME3("??_V@YAXPEAX_K@Z", mi_free_size, mi_free_size_term), // delete sized
- MI_PATCH_NAME2("??2@YAPEAX_KAEBUnothrow_t@std@@@Z", mi_new),
- MI_PATCH_NAME2("??_U@YAPEAX_KAEBUnothrow_t@std@@@Z", mi_new),
- MI_PATCH_NAME3("??3@YAXPEAXAEBUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term),
- MI_PATCH_NAME3("??_V@YAXPEAXAEBUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term),
-
-
-#else
- // 32 bit new/delete
- MI_PATCH_NAME2("??2@YAPAXI@Z", mi_new),
- MI_PATCH_NAME2("??_U@YAPAXI@Z", mi_new),
- MI_PATCH_NAME3("??3@YAXPAX@Z", mi_free, mi_free_term),
- MI_PATCH_NAME3("??_V@YAXPAX@Z", mi_free, mi_free_term),
- MI_PATCH_NAME3("??3@YAXPAXI@Z", mi_free_size, mi_free_size_term), // delete sized
- MI_PATCH_NAME3("??_V@YAXPAXI@Z", mi_free_size, mi_free_size_term), // delete sized
-
- MI_PATCH_NAME2("??2@YAPAXIABUnothrow_t@std@@@Z", mi_new),
- MI_PATCH_NAME2("??_U@YAPAXIABUnothrow_t@std@@@Z", mi_new),
- MI_PATCH_NAME3("??3@YAXPAXABUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term),
- MI_PATCH_NAME3("??_V@YAXPAXABUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term),
-
-#endif
-#endif
- { NULL, NULL, NULL, PATCH_NONE, {NULL,NULL,NULL,NULL} }
-};
-
-
-// Apply a patch
-static bool mi_patch_apply(mi_patch_t* patch, patch_apply_t apply)
-{
- if (patch->originals[0] == NULL) return true; // unresolved
- if (apply == PATCH_TARGET_TERM && patch->target_term == NULL) apply = PATCH_TARGET; // avoid re-applying non-term variants
- if (patch->applied == apply) return false;
-
- for (int i = 0; i < MAX_ENTRIES; i++) {
- void* original = patch->originals[i];
- if (original == NULL) break; // no more
-
- DWORD protect = PAGE_READWRITE;
- if (!VirtualProtect(original, MI_JUMP_SIZE, PAGE_EXECUTE_READWRITE, &protect)) return false;
- if (apply == PATCH_NONE) {
- mi_jump_restore(original, &patch->saves[i]);
- }
- else {
- void* target = (apply == PATCH_TARGET ? patch->target : patch->target_term);
- mi_assert_internal(target != NULL);
- if (target != NULL) mi_jump_write(original, target, &patch->saves[i]);
- }
- VirtualProtect(original, MI_JUMP_SIZE, protect, &protect);
- }
- patch->applied = apply;
- return true;
-}
-
-// Apply all patches
-static bool _mi_patches_apply(patch_apply_t apply, patch_apply_t* previous) {
- static patch_apply_t current = PATCH_NONE;
- if (previous != NULL) *previous = current;
- if (current == apply) return true;
- current = apply;
- bool ok = true;
- for (size_t i = 0; patches[i].name != NULL; i++) {
- if (!mi_patch_apply(&patches[i], apply)) ok = false;
- }
- return ok;
-}
-
-// Export the following three functions just in case
-// a user needs that level of control.
-
-// Disable all patches
-mi_decl_export void mi_patches_disable(void) {
- _mi_patches_apply(PATCH_NONE, NULL);
-}
-
-// Enable all patches normally
-mi_decl_export bool mi_patches_enable(void) {
- return _mi_patches_apply( PATCH_TARGET, NULL );
-}
-
-// Enable all patches in termination phase where free is a no-op
-mi_decl_export bool mi_patches_enable_term(void) {
- return _mi_patches_apply(PATCH_TARGET_TERM, NULL);
-}
-
-// ------------------------------------------------------
-// Stub for _setmaxstdio
-// ------------------------------------------------------
-
-static int __cdecl mi_setmaxstdio(int newmax) {
- patch_apply_t previous;
- _mi_patches_apply(PATCH_NONE, &previous); // disable patches
- int result = _setmaxstdio(newmax); // call original function (that calls original CRT recalloc)
- _mi_patches_apply(previous,NULL); // and re-enable patches
- return result;
-}
-
-
-// ------------------------------------------------------
-// Resolve addresses dynamically
-// ------------------------------------------------------
-
-// Try to resolve patches for a given module (DLL)
-static void mi_module_resolve(const char* fname, HMODULE mod, int priority) {
- // see if any patches apply
- for (size_t i = 0; patches[i].name != NULL; i++) {
- mi_patch_t* patch = &patches[i];
- if (patch->applied == PATCH_NONE) {
- // find an available entry
- int i = 0;
- while (i < MAX_ENTRIES && patch->originals[i] != NULL) i++;
- if (i < MAX_ENTRIES) {
- void* addr = GetProcAddress(mod, patch->name);
- if (addr != NULL) {
- // found it! set the address
- patch->originals[i] = addr;
- _mi_trace_message(" found %s at %s!%p (entry %i)\n", patch->name, fname, addr, i);
- }
- }
- }
- }
-}
-
-#define MIMALLOC_NAME "mimalloc-override.dll"
-#define UCRTBASE_NAME "ucrtbase.dll"
-#define UCRTBASED_NAME "ucrtbased.dll"
-
-// Resolve addresses of all patches by inspecting the loaded modules
-static atexit_fun_t* crt_atexit = NULL;
-static atexit_fun_t* crt_at_quick_exit = NULL;
-
-
-static bool mi_patches_resolve(void) {
- // get all loaded modules
- HANDLE process = GetCurrentProcess(); // always -1, no need to release
- DWORD needed = 0;
- HMODULE modules[400]; // try to stay under 4k to not trigger the guard page
- EnumProcessModules(process, modules, sizeof(modules), &needed);
- if (needed == 0) return false;
- int count = needed / sizeof(HMODULE);
- int ucrtbase_index = 0;
- int mimalloc_index = 0;
- // iterate through the loaded modules
- for (int i = 0; i < count; i++) {
- HMODULE mod = modules[i];
- char filename[MAX_PATH] = { 0 };
- DWORD slen = GetModuleFileName(mod, filename, MAX_PATH);
- if (slen > 0 && slen < MAX_PATH) {
- // filter out potential crt modules only
- filename[slen] = 0;
- const char* lastsep = strrchr(filename, '\\');
- const char* basename = (lastsep==NULL ? filename : lastsep+1);
- _mi_trace_message(" %i: dynamic module %s\n", i, filename);
-
- // remember indices so we can check load order (in debug mode)
- if (_stricmp(basename, MIMALLOC_NAME) == 0) mimalloc_index = i;
- if (_stricmp(basename, UCRTBASE_NAME) == 0) ucrtbase_index = i;
- if (_stricmp(basename, UCRTBASED_NAME) == 0) ucrtbase_index = i;
-
- // see if we potentially patch in this module
- int priority = 0;
- if (i == 0) priority = 2; // main module to allow static crt linking
- else if (_strnicmp(basename, "ucrt", 4) == 0) priority = 3; // new ucrtbase.dll in windows 10
- // NOTE: don't override msvcr -- leads to crashes in setlocale (needs more testing)
- // else if (_strnicmp(basename, "msvcr", 5) == 0) priority = 1; // older runtimes
-
- if (priority > 0) {
- // probably found a crt module, try to patch it
- mi_module_resolve(basename,mod,priority);
-
- // try to find the atexit functions for the main process (in `ucrtbase.dll`)
- if (crt_atexit==NULL) crt_atexit = (atexit_fun_t*)GetProcAddress(mod, "_crt_atexit");
- if (crt_at_quick_exit == NULL) crt_at_quick_exit = (atexit_fun_t*)GetProcAddress(mod, "_crt_at_quick_exit");
- }
- }
- }
- int diff = mimalloc_index - ucrtbase_index;
- if (diff > 1) {
- _mi_warning_message("warning: the \"mimalloc-override\" DLL seems not to load before or right after the C runtime (\"ucrtbase\").\n"
- " Try to fix this by changing the linking order.\n");
- }
- return true;
-}
-
-
-// ------------------------------------------------------
-// Dll Entry
-// ------------------------------------------------------
-
-extern BOOL WINAPI _DllMainCRTStartup(HINSTANCE inst, DWORD reason, LPVOID reserved);
-
-static DWORD mi_fls_unwind_entry;
-static void NTAPI mi_fls_unwind(PVOID value) {
- if (value != NULL) mi_patches_enable(); // and re-enable normal patches again for DLL's loaded after us
- return;
-}
-
-static void mi_patches_atexit(void) {
- mi_execute_exit_list(&atexit_list);
- mi_patches_enable_term(); // enter termination phase and patch realloc/free with a no-op
-}
-
-static void mi_patches_at_quick_exit(void) {
- mi_execute_exit_list(&at_quick_exit_list);
- mi_patches_enable_term(); // enter termination phase and patch realloc/free with a no-op
-}
-
-BOOL WINAPI DllEntry(HINSTANCE inst, DWORD reason, LPVOID reserved) {
- if (reason == DLL_PROCESS_ATTACH) {
- __security_init_cookie();
- }
- else if (reason == DLL_PROCESS_DETACH) {
- // enter termination phase for good now
- mi_patches_enable_term();
- }
- // C runtime main
- BOOL ok = _DllMainCRTStartup(inst, reason, reserved);
- if (reason == DLL_PROCESS_ATTACH && ok) {
- // initialize at exit lists
- mi_initialize_atexit();
-
- // Now resolve patches
- ok = mi_patches_resolve();
- if (ok) {
- // check if patching is not disabled
- #pragma warning(suppress:4996)
- const char* s = getenv("MIMALLOC_DISABLE_OVERRIDE");
- bool enabled = (s == NULL || !(strstr("1;TRUE;YES;ON", s) != NULL));
- if (!enabled) {
- _mi_verbose_message("override is disabled\n");
- }
- else {
- // and register our unwind entry (this must be after resolving due to possible delayed DLL initialization from GetProcAddress)
- mi_fls_unwind_entry = FlsAlloc(&mi_fls_unwind);
- if (mi_fls_unwind_entry != FLS_OUT_OF_INDEXES) {
- FlsSetValue(mi_fls_unwind_entry, (void*)1);
- }
-
- // register our patch disabler in the global exit list
- if (crt_atexit != NULL) (*crt_atexit)(&mi_patches_atexit);
- if (crt_at_quick_exit != NULL) (*crt_at_quick_exit)(&mi_patches_at_quick_exit);
-
- // and patch ! this also redirects the `atexit` handling for the global exit list
- mi_patches_enable();
- _mi_verbose_message("override is enabled\n");
-
- // hide internal allocation
- mi_stats_reset();
- }
- }
- }
- return ok;
-}
diff --git a/src/alloc.c b/src/alloc.c
index 9d50bf9f..e68b48d2 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -32,10 +32,10 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz
page->free = mi_block_next(page,block);
page->used++;
mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
-#if (MI_DEBUG)
- if (!page->flags.is_zero) { memset(block, MI_DEBUG_UNINIT, size); }
-#elif (MI_SECURE)
- block->next = 0;
+#if (MI_DEBUG!=0)
+ if (!page->is_zero) { memset(block, MI_DEBUG_UNINIT, size); }
+#elif (MI_SECURE!=0)
+ block->next = 0; // don't leak internal data
#endif
#if (MI_STAT>1)
if(size <= MI_LARGE_OBJ_SIZE_MAX) {
@@ -47,26 +47,26 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz
}
// allocate a small block
-extern inline void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+extern inline mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
mi_assert(size <= MI_SMALL_SIZE_MAX);
mi_page_t* page = _mi_heap_get_free_small_page(heap,size);
return _mi_page_malloc(heap, page, size);
}
-extern inline void* mi_malloc_small(size_t size) mi_attr_noexcept {
+extern inline mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept {
return mi_heap_malloc_small(mi_get_default_heap(), size);
}
// zero initialized small block
-void* mi_zalloc_small(size_t size) mi_attr_noexcept {
+mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept {
void* p = mi_malloc_small(size);
if (p != NULL) { memset(p, 0, size); }
return p;
}
// The main allocation function
-extern inline void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+extern inline mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
mi_assert(heap!=NULL);
mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
void* p;
@@ -85,7 +85,7 @@ extern inline void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcep
return p;
}
-extern inline void* mi_malloc(size_t size) mi_attr_noexcept {
+extern inline mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept {
return mi_heap_malloc(mi_get_default_heap(), size);
}
@@ -96,7 +96,7 @@ void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) {
mi_assert_internal(p != NULL);
mi_assert_internal(size > 0 && page->block_size >= size);
mi_assert_internal(_mi_ptr_page(p)==page);
- if (page->flags.is_zero) {
+ if (page->is_zero) {
// already zero initialized memory?
((mi_block_t*)p)->next = 0; // clear the free list pointer
mi_assert_expensive(mi_mem_is_zero(p,page->block_size));
@@ -115,15 +115,67 @@ void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) {
return p;
}
-extern inline void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+extern inline mi_decl_allocator void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
return _mi_heap_malloc_zero(heap, size, true);
}
-void* mi_zalloc(size_t size) mi_attr_noexcept {
+mi_decl_allocator void* mi_zalloc(size_t size) mi_attr_noexcept {
return mi_heap_zalloc(mi_get_default_heap(),size);
}
+// ------------------------------------------------------
+// Check for double free in secure and debug mode
+// This is somewhat expensive so only enabled for secure mode 4
+// ------------------------------------------------------
+
+#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0))
+// linear check if the free list contains a specific element
+static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) {
+ while (list != NULL) {
+ if (elem==list) return true;
+ list = mi_block_next(page, list);
+ }
+ return false;
+}
+
+static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block, const mi_block_t* n) {
+ size_t psize;
+ uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize);
+ if (n == NULL || ((uint8_t*)n >= pstart && (uint8_t*)n < (pstart + psize))) {
+ // Suspicious: the decoded value is in the same page (or NULL).
+ // Walk the free lists to verify positively if it is already freed
+ if (mi_list_contains(page, page->free, block) ||
+ mi_list_contains(page, page->local_free, block) ||
+ mi_list_contains(page, (const mi_block_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*,&page->thread_free)), block))
+ {
+ _mi_fatal_error("double free detected of block %p with size %zu\n", block, page->block_size);
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+ mi_block_t* n = mi_block_nextx(page, block, page->cookie); // pretend it is freed, and get the decoded first field
+ if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer?
+ (n==NULL || mi_is_in_same_segment(block, n))) // quick check: in same segment or NULL?
+ {
+ // Suspicious: decoded value in block is in the same segment (or NULL) -- maybe a double free?
+ // (continue in separate function to improve code generation)
+ return mi_check_is_double_freex(page, block, n);
+ }
+ return false;
+}
+#else
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+ UNUSED(page);
+ UNUSED(block);
+ return false;
+}
+#endif
+
+
// ------------------------------------------------------
// Free
// ------------------------------------------------------
@@ -147,8 +199,16 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
mi_block_set_next(page, block, page->free);
page->free = block;
page->used--;
- page->flags.is_zero = false;
- _mi_segment_page_free(page,true,&heap->tld->segments);
+ page->is_zero = false;
+ mi_assert(page->used == 0);
+ mi_tld_t* tld = heap->tld;
+ if (page->block_size > MI_HUGE_OBJ_SIZE_MAX) {
+ _mi_stat_decrease(&tld->stats.giant, page->block_size);
+ }
+ else {
+ _mi_stat_decrease(&tld->stats.huge, page->block_size);
+ }
+ _mi_segment_page_free(page,true,&tld->segments);
}
return;
}
@@ -175,14 +235,14 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
}
else {
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
- mi_heap_t* heap = page->heap;
+ mi_heap_t* heap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap));
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree;
do {
dfree = (mi_block_t*)heap->thread_delayed_free;
- mi_block_set_nextx(heap->cookie,block,dfree);
+ mi_block_set_nextx(heap,block,dfree, heap->cookie);
} while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
}
@@ -206,6 +266,7 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block
// and push it on the free list
if (mi_likely(local)) {
// owning thread can free a block directly
+ if (mi_check_is_double_free(page, block)) return;
mi_block_set_next(page, block, page->local_free);
page->local_free = block;
page->used--;
@@ -249,16 +310,18 @@ void mi_free(void* p) mi_attr_noexcept
const mi_segment_t* const segment = _mi_ptr_segment(p);
if (mi_unlikely(segment == NULL)) return; // checks for (p==NULL)
-#if (MI_DEBUG>0)
+#if (MI_DEBUG!=0)
if (mi_unlikely(!mi_is_in_heap_region(p))) {
- _mi_warning_message("possibly trying to mi_free a pointer that does not point to a valid heap region: 0x%p\n"
+ _mi_warning_message("possibly trying to free a pointer that does not point to a valid heap region: 0x%p\n"
"(this may still be a valid very large allocation (over 64MiB))\n", p);
if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) {
_mi_warning_message("(yes, the previous pointer 0x%p was valid after all)\n", p);
}
}
+#endif
+#if (MI_DEBUG!=0 || MI_SECURE>=4)
if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) {
- _mi_error_message("trying to mi_free a pointer that does not point to a valid heap space: %p\n", p);
+ _mi_error_message("trying to free a pointer that does not point to a valid heap space: %p\n", p);
return;
}
#endif
@@ -278,6 +341,7 @@ void mi_free(void* p) mi_attr_noexcept
if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks
// local, and not full or aligned
mi_block_t* block = (mi_block_t*)p;
+ if (mi_check_is_double_free(page,block)) return;
mi_block_set_next(page, block, page->local_free);
page->local_free = block;
page->used--;
@@ -360,29 +424,29 @@ void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept {
mi_free(p);
}
-extern inline void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+extern inline mi_decl_allocator void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_heap_zalloc(heap,total);
}
-void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
+mi_decl_allocator void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
return mi_heap_calloc(mi_get_default_heap(),count,size);
}
// Uninitialized `calloc`
-extern void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+extern mi_decl_allocator void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_heap_malloc(heap, total);
}
-void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
+mi_decl_allocator void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
return mi_heap_mallocn(mi_get_default_heap(),count,size);
}
// Expand in place or fail
-void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
+mi_decl_allocator void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
if (p == NULL) return NULL;
size_t size = mi_usable_size(p);
if (newsize > size) return NULL;
@@ -408,11 +472,11 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero)
return newp;
}
-void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
return _mi_heap_realloc_zero(heap, p, newsize, false);
}
-void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count, size, &total)) return NULL;
return mi_heap_realloc(heap, p, total);
@@ -420,41 +484,41 @@ void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_a
// Reallocate but free `p` on errors
-void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
void* newp = mi_heap_realloc(heap, p, newsize);
if (newp==NULL && p!=NULL) mi_free(p);
return newp;
}
-void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
return _mi_heap_realloc_zero(heap, p, newsize, true);
}
-void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+mi_decl_allocator void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count, size, &total)) return NULL;
return mi_heap_rezalloc(heap, p, total);
}
-void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
+mi_decl_allocator void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
return mi_heap_realloc(mi_get_default_heap(),p,newsize);
}
-void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
+mi_decl_allocator void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
return mi_heap_reallocn(mi_get_default_heap(),p,count,size);
}
// Reallocate but free `p` on errors
-void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
+mi_decl_allocator void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
return mi_heap_reallocf(mi_get_default_heap(),p,newsize);
}
-void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
+mi_decl_allocator void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
return mi_heap_rezalloc(mi_get_default_heap(), p, newsize);
}
-void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
+mi_decl_allocator void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
return mi_heap_recalloc(mi_get_default_heap(), p, count, size);
}
diff --git a/src/heap.c b/src/heap.c
index 15c5d02a..daa9b241 100644
--- a/src/heap.c
+++ b/src/heap.c
@@ -223,7 +223,7 @@ static void mi_heap_free(mi_heap_t* heap) {
// reset default
if (mi_heap_is_default(heap)) {
- _mi_heap_default = heap->tld->heap_backing;
+ _mi_heap_set_default_direct(heap->tld->heap_backing);
}
// and free the used memory
mi_free(heap);
@@ -354,8 +354,8 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
if (!mi_heap_is_initialized(heap)) return NULL;
mi_assert_expensive(mi_heap_is_valid(heap));
- mi_heap_t* old = _mi_heap_default;
- _mi_heap_default = heap;
+ mi_heap_t* old = mi_get_default_heap();
+ _mi_heap_set_default_direct(heap);
return old;
}
diff --git a/src/init.c b/src/init.c
index 5ab39c28..81413aa9 100644
--- a/src/init.c
+++ b/src/init.c
@@ -13,16 +13,16 @@ terms of the MIT license. A copy of the license can be found in the file
// Empty page used to initialize the small free pages array
const mi_page_t _mi_page_empty = {
0, false, false, false, false, 0, 0,
- { 0 },
+ { 0 }, false,
NULL, // free
- #if MI_SECURE
+ #if MI_ENCODE_FREELIST
0,
#endif
0, // used
NULL,
ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(0),
0, NULL, NULL, NULL
- #if (MI_INTPTR_SIZE==8 && MI_SECURE>0) || (MI_INTPTR_SIZE==4 && MI_SECURE==0)
+ #if (MI_INTPTR_SIZE==8 && defined(MI_ENCODE_FREELIST)) || (MI_INTPTR_SIZE==4 && !defined(MI_ENCODE_FREELIST))
, { NULL } // padding
#endif
};
@@ -64,8 +64,8 @@ const mi_page_t _mi_page_empty = {
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
- MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
- MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+ MI_STAT_COUNT_NULL(), \
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
MI_STAT_COUNT_END_NULL()
@@ -90,6 +90,7 @@ const mi_heap_t _mi_heap_empty = {
false
};
+// the thread-local default heap for allocation
mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
@@ -184,10 +185,6 @@ uintptr_t _mi_random_init(uintptr_t seed /* can be zero */) {
return x;
}
-uintptr_t _mi_ptr_cookie(const void* p) {
- return ((uintptr_t)p ^ _mi_heap_main.cookie);
-}
-
/* -----------------------------------------------------------
Initialization and freeing of the thread local heaps
----------------------------------------------------------- */
@@ -202,8 +199,8 @@ static bool _mi_heap_init(void) {
if (mi_heap_is_initialized(_mi_heap_default)) return true;
if (_mi_is_main_thread()) {
// the main heap is statically allocated
- _mi_heap_default = &_mi_heap_main;
- mi_assert_internal(_mi_heap_default->tld->heap_backing == _mi_heap_default);
+ _mi_heap_set_default_direct(&_mi_heap_main);
+ mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
}
else {
// use `_mi_os_alloc` to allocate directly from the OS
@@ -223,18 +220,17 @@ static bool _mi_heap_init(void) {
tld->heap_backing = heap;
tld->segments.stats = &tld->stats;
tld->os.stats = &tld->stats;
- _mi_heap_default = heap;
+ _mi_heap_set_default_direct(heap);
}
return false;
}
// Free the thread local default heap (called from `mi_thread_done`)
-static bool _mi_heap_done(void) {
- mi_heap_t* heap = _mi_heap_default;
+static bool _mi_heap_done(mi_heap_t* heap) {
if (!mi_heap_is_initialized(heap)) return true;
// reset default heap
- _mi_heap_default = (_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
+ _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
// todo: delete all non-backing heaps?
@@ -281,6 +277,8 @@ static bool _mi_heap_done(void) {
// to set up the thread local keys.
// --------------------------------------------------------
+static void _mi_thread_done(mi_heap_t* default_heap);
+
#ifdef __wasi__
// no pthreads in the WebAssembly Standard Interface
#elif !defined(_WIN32)
@@ -295,14 +293,14 @@ static bool _mi_heap_done(void) {
#include
static DWORD mi_fls_key;
static void NTAPI mi_fls_done(PVOID value) {
- if (value!=NULL) mi_thread_done();
+ if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
}
#elif defined(MI_USE_PTHREADS)
// use pthread locol storage keys to detect thread ending
#include <pthread.h>
static pthread_key_t mi_pthread_key;
static void mi_pthread_done(void* value) {
- if (value!=NULL) mi_thread_done();
+ if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
}
#elif defined(__wasi__)
// no pthreads in the WebAssembly Standard Interface
@@ -336,6 +334,8 @@ void mi_thread_init(void) mi_attr_noexcept
mi_process_init();
// initialize the thread local default heap
+ // (this will call `_mi_heap_set_default_direct` and thus set the
+ // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
if (_mi_heap_init()) return; // returns true if already initialized
// don't further initialize for the main thread
@@ -343,33 +343,38 @@ void mi_thread_init(void) mi_attr_noexcept
_mi_stat_increase(&mi_get_default_heap()->tld->stats.threads, 1);
- // set hooks so our mi_thread_done() will be called
- #if defined(_WIN32) && defined(MI_SHARED_LIB)
- // nothing to do as it is done in DllMain
- #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
- FlsSetValue(mi_fls_key, (void*)(_mi_thread_id()|1)); // set to a dummy value so that `mi_fls_done` is called
- #elif defined(MI_USE_PTHREADS)
- pthread_setspecific(mi_pthread_key, (void*)(_mi_thread_id()|1)); // set to a dummy value so that `mi_pthread_done` is called
- #endif
-
//_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
}
void mi_thread_done(void) mi_attr_noexcept {
+ _mi_thread_done(mi_get_default_heap());
+}
+
+static void _mi_thread_done(mi_heap_t* heap) {
// stats
- mi_heap_t* heap = mi_get_default_heap();
if (!_mi_is_main_thread() && mi_heap_is_initialized(heap)) {
_mi_stat_decrease(&heap->tld->stats.threads, 1);
}
-
// abandon the thread local heap
- if (_mi_heap_done()) return; // returns true if already ran
-
- //if (!_mi_is_main_thread()) {
- // _mi_verbose_message("thread done: 0x%zx\n", _mi_thread_id());
- //}
+ if (_mi_heap_done(heap)) return; // returns true if already ran
}
+void _mi_heap_set_default_direct(mi_heap_t* heap) {
+ mi_assert_internal(heap != NULL);
+ _mi_heap_default = heap;
+
+ // ensure the default heap is passed to `_mi_thread_done`
+ // setting to a non-NULL value also ensures `mi_thread_done` is called.
+ #if defined(_WIN32) && defined(MI_SHARED_LIB)
+ // nothing to do as it is done in DllMain
+ #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+ FlsSetValue(mi_fls_key, heap);
+ #elif defined(MI_USE_PTHREADS)
+ pthread_setspecific(mi_pthread_key, heap);
+ #endif
+}
+
+
// --------------------------------------------------------
// Run functions on process init/done, and thread init/done
@@ -450,7 +455,7 @@ void mi_process_init(void) mi_attr_noexcept {
// access _mi_heap_default before setting _mi_process_is_initialized to ensure
// that the TLS slot is allocated without getting into recursion on macOS
// when using dynamic linking with interpose.
- mi_heap_t* h = _mi_heap_default;
+ mi_heap_t* h = mi_get_default_heap();
_mi_process_is_initialized = true;
_mi_heap_main.thread_id = _mi_thread_id();
@@ -465,6 +470,7 @@ void mi_process_init(void) mi_attr_noexcept {
#if (MI_DEBUG)
_mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif
+ _mi_verbose_message("secure level: %d\n", MI_SECURE);
mi_thread_init();
mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL)
}
diff --git a/src/memory.c b/src/memory.c
index 0ad582cd..dd03cf95 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -71,7 +71,7 @@ bool _mi_os_is_huge_reserved(void* p);
typedef uintptr_t mi_region_info_t;
static inline mi_region_info_t mi_region_info_create(void* start, bool is_large, bool is_committed) {
- return ((uintptr_t)start | ((is_large?1:0) << 1) | (is_committed?1:0));
+ return ((uintptr_t)start | ((uintptr_t)(is_large?1:0) << 1) | (is_committed?1:0));
}
static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, bool* is_committed) {
@@ -461,10 +461,14 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
// reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large
if (!is_large) {
if (mi_option_is_enabled(mi_option_segment_reset)) {
- _mi_os_reset(p, size, stats); //
- // _mi_os_decommit(p,size,stats); // if !is_eager_committed (and clear dirty bits)
+ if (!is_eager_committed && // cannot reset large pages
+ (mi_option_is_enabled(mi_option_eager_commit) || // cannot reset halfway committed segments, use `option_page_reset` instead
+ mi_option_is_enabled(mi_option_reset_decommits))) // but we can decommit halfway committed segments
+ {
+ _mi_os_reset(p, size, stats);
+ //_mi_os_decommit(p, size, stats); // todo: and clear dirty bits?
+ }
}
- // else { _mi_os_reset(p,size,stats); }
}
if (!is_eager_committed) {
// adjust commit statistics as we commit again when re-using the same slot
diff --git a/src/options.c b/src/options.c
index 09524cb4..0bee74e0 100644
--- a/src/options.c
+++ b/src/options.c
@@ -14,6 +14,10 @@ terms of the MIT license. A copy of the license can be found in the file
#include <ctype.h> // toupper
#include
+static uintptr_t mi_max_error_count = 16; // stop outputting errors after this
+
+static void mi_add_stderr_output();
+
int mi_version(void) mi_attr_noexcept {
return MI_MALLOC_VERSION;
}
@@ -65,14 +69,17 @@ static mi_option_desc_t options[_mi_option_last] =
{ 0, UNINIT, MI_OPTION(cache_reset) },
{ 0, UNINIT, MI_OPTION(reset_decommits) }, // note: cannot enable this if secure is on
{ 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
- { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free
- { 100, UNINIT, MI_OPTION(os_tag) } // only apple specific for now but might serve more or less related purpose
+ { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit)
+ { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose
+ { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output
};
static void mi_option_init(mi_option_desc_t* desc);
void _mi_options_init(void) {
- // called on process load
+ // called on process load; should not be called before the CRT is initialized!
+ // (e.g. do not call this from process_init as that may run before CRT initialization)
+ mi_add_stderr_output(); // now it safe to use stderr for output
for(int i = 0; i < _mi_option_last; i++ ) {
mi_option_t option = (mi_option_t)i;
mi_option_get(option); // initialize
@@ -81,6 +88,7 @@ void _mi_options_init(void) {
_mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
}
}
+ mi_max_error_count = mi_option_get(mi_option_max_errors);
}
long mi_option_get(mi_option_t option) {
@@ -134,12 +142,60 @@ static void mi_out_stderr(const char* msg) {
#ifdef _WIN32
// on windows with redirection, the C runtime cannot handle locale dependent output
// after the main thread closes so we use direct console output.
- _cputs(msg);
+ if (!_mi_preloading()) { _cputs(msg); }
#else
fputs(msg, stderr);
#endif
}
+// Since an output function can be registered earliest in the `main`
+// function we also buffer output that happens earlier. When
+// an output function is registered it is called immediately with
+// the output up to that point.
+#ifndef MI_MAX_DELAY_OUTPUT
+#define MI_MAX_DELAY_OUTPUT (32*1024)
+#endif
+static char out_buf[MI_MAX_DELAY_OUTPUT+1];
+static _Atomic(uintptr_t) out_len;
+
+static void mi_out_buf(const char* msg) {
+ if (msg==NULL) return;
+ if (mi_atomic_read_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
+ size_t n = strlen(msg);
+ if (n==0) return;
+ // claim space
+ uintptr_t start = mi_atomic_addu(&out_len, n);
+ if (start >= MI_MAX_DELAY_OUTPUT) return;
+ // check bound
+ if (start+n >= MI_MAX_DELAY_OUTPUT) {
+ n = MI_MAX_DELAY_OUTPUT-start-1;
+ }
+ memcpy(&out_buf[start], msg, n);
+}
+
+static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf) {
+ if (out==NULL) return;
+ // claim (if `no_more_buf == true`, no more output will be added after this point)
+ size_t count = mi_atomic_addu(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
+ // and output the current contents
+ if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
+ out_buf[count] = 0;
+ out(out_buf);
+ if (!no_more_buf) {
+ out_buf[count] = '\n'; // if continue with the buffer, insert a newline
+ }
+}
+
+
+// Once this module is loaded, switch to this routine
+// which outputs to stderr and the delayed output buffer.
+static void mi_out_buf_stderr(const char* msg) {
+ mi_out_stderr(msg);
+ mi_out_buf(msg);
+}
+
+
+
// --------------------------------------------------------
// Default output handler
// --------------------------------------------------------
@@ -151,16 +207,22 @@ static mi_output_fun* volatile mi_out_default; // = NULL
static mi_output_fun* mi_out_get_default(void) {
mi_output_fun* out = mi_out_default;
- return (out == NULL ? &mi_out_stderr : out);
+ return (out == NULL ? &mi_out_buf : out);
}
void mi_register_output(mi_output_fun* out) mi_attr_noexcept {
- mi_out_default = out;
+ mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer
+ if (out!=NULL) mi_out_buf_flush(out,true); // output all the delayed output now
}
+// add stderr to the delayed output after the module is loaded
+static void mi_add_stderr_output() {
+ mi_out_buf_flush(&mi_out_stderr, false); // flush current contents to stderr
+ mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output
+}
// --------------------------------------------------------
-// Messages
+// Messages, all end up calling `_mi_fputs`.
// --------------------------------------------------------
#define MAX_ERROR_COUNT (10)
static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings
@@ -170,7 +232,7 @@ static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT
static mi_decl_thread bool recurse = false;
void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message) {
- if (_mi_preloading() || recurse) return;
+ if (recurse) return;
if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) out = mi_out_get_default();
recurse = true;
if (prefix != NULL) out(prefix);
@@ -184,7 +246,7 @@ void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message) {
static void mi_vfprintf( mi_output_fun* out, const char* prefix, const char* fmt, va_list args ) {
char buf[512];
if (fmt==NULL) return;
- if (_mi_preloading() || recurse) return;
+ if (recurse) return;
recurse = true;
vsnprintf(buf,sizeof(buf)-1,fmt,args);
recurse = false;
@@ -217,7 +279,7 @@ void _mi_verbose_message(const char* fmt, ...) {
void _mi_error_message(const char* fmt, ...) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
- if (mi_atomic_increment(&error_count) > MAX_ERROR_COUNT) return;
+ if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(NULL, "mimalloc: error: ", fmt, args);
@@ -227,7 +289,7 @@ void _mi_error_message(const char* fmt, ...) {
void _mi_warning_message(const char* fmt, ...) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
- if (mi_atomic_increment(&error_count) > MAX_ERROR_COUNT) return;
+ if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(NULL, "mimalloc: warning: ", fmt, args);
@@ -242,6 +304,16 @@ void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, co
}
#endif
+mi_attr_noreturn void _mi_fatal_error(const char* fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ mi_vfprintf(NULL, "mimalloc: fatal: ", fmt, args);
+ va_end(args);
+ #if (MI_SECURE>=0)
+ abort();
+ #endif
+}
+
// --------------------------------------------------------
// Initialize options by checking the environment
// --------------------------------------------------------
@@ -303,7 +375,7 @@ static void mi_option_init(mi_option_desc_t* desc) {
size_t len = strlen(s);
if (len >= sizeof(buf)) len = sizeof(buf) - 1;
for (size_t i = 0; i < len; i++) {
- buf[i] = toupper(s[i]);
+ buf[i] = (char)toupper(s[i]);
}
buf[len] = 0;
if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) {
diff --git a/src/os.c b/src/os.c
index 1aaceb1e..0aed771a 100644
--- a/src/os.c
+++ b/src/os.c
@@ -145,13 +145,13 @@ void _mi_os_init(void) {
hDll = LoadLibrary(TEXT("kernelbase.dll"));
if (hDll != NULL) {
// use VirtualAlloc2FromApp if possible as it is available to Windows store apps
- pVirtualAlloc2 = (PVirtualAlloc2)GetProcAddress(hDll, "VirtualAlloc2FromApp");
- if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)GetProcAddress(hDll, "VirtualAlloc2");
+ pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp");
+ if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2");
FreeLibrary(hDll);
}
hDll = LoadLibrary(TEXT("ntdll.dll"));
if (hDll != NULL) {
- pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
+ pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
FreeLibrary(hDll);
}
if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
@@ -317,7 +317,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment,
p = mi_win_virtual_allocx(addr, size, try_alignment, flags);
}
if (p == NULL) {
- _mi_warning_message("unable to alloc mem error: err: %i size: 0x%x \n", GetLastError(), size);
+ _mi_warning_message("unable to allocate memory: error code: %i, addr: %p, size: 0x%x, large only: %d, allow_large: %d\n", GetLastError(), addr, size, large_only, allow_large);
}
return p;
}
@@ -490,6 +490,7 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo
if (!commit) allow_large = false;
void* p = NULL;
+ /*
if (commit && allow_large) {
p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment);
if (p != NULL) {
@@ -497,6 +498,7 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo
return p;
}
}
+ */
#if defined(_WIN32)
int flags = MEM_RESERVE;
@@ -509,7 +511,7 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo
int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
#endif
- _mi_stat_increase(&stats->mmap_calls, 1);
+ mi_stat_counter_increase(stats->mmap_calls, 1);
if (p != NULL) {
_mi_stat_increase(&stats->reserved, size);
if (commit) { _mi_stat_increase(&stats->committed, size); }
@@ -664,7 +666,7 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
int err = 0;
if (commit) {
_mi_stat_increase(&stats->committed, csize);
- _mi_stat_increase(&stats->commit_calls, 1);
+ _mi_stat_counter_increase(&stats->commit_calls, 1);
}
else {
_mi_stat_decrease(&stats->committed, csize);
@@ -728,7 +730,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE);
mi_assert_internal(p == start);
#if 1
- if (p == start) {
+ if (p == start && start != NULL) {
VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set
}
#endif
@@ -914,7 +916,7 @@ int mi_reserve_huge_os_pages( size_t pages, double max_secs, size_t* pages_reser
uint8_t* start = (uint8_t*)((uintptr_t)32 << 40); // 32TiB virtual start address
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode
uintptr_t r = _mi_random_init((uintptr_t)&mi_reserve_huge_os_pages);
- start = start + ((uintptr_t)MI_SEGMENT_SIZE * ((r>>17) & 0xFFFF)); // (randomly 0-64k)*4MiB == 0 to 256GiB
+ start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB
#endif
// Allocate one page at the time but try to place them contiguously
diff --git a/src/page-queue.c b/src/page-queue.c
index d613095f..95443a69 100644
--- a/src/page-queue.c
+++ b/src/page-queue.c
@@ -57,7 +57,7 @@ static inline uint8_t mi_bsr32(uint32_t x);
static inline uint8_t mi_bsr32(uint32_t x) {
uint32_t idx;
_BitScanReverse((DWORD*)&idx, x);
- return idx;
+ return (uint8_t)idx;
}
#elif defined(__GNUC__) || defined(__clang__)
static inline uint8_t mi_bsr32(uint32_t x) {
@@ -260,7 +260,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
page->heap->page_count--;
page->next = NULL;
page->prev = NULL;
- page->heap = NULL;
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
mi_page_set_in_full(page,false);
}
@@ -274,7 +274,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
- page->heap = heap;
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
page->next = queue->first;
page->prev = NULL;
if (queue->first != NULL) {
@@ -338,7 +338,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
// set append pages to new heap and count
size_t count = 0;
for (mi_page_t* page = append->first; page != NULL; page = page->next) {
- page->heap = heap;
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
count++;
}
diff --git a/src/page.c b/src/page.c
index 25e59977..437cd0a5 100644
--- a/src/page.c
+++ b/src/page.c
@@ -161,14 +161,21 @@ static void _mi_page_thread_free_collect(mi_page_t* page)
// return if the list is empty
if (head == NULL) return;
- // find the tail
+ // find the tail -- also to get a proper count (without data races)
+ uintptr_t max_count = page->capacity; // cannot collect more than capacity
uintptr_t count = 1;
mi_block_t* tail = head;
mi_block_t* next;
- while ((next = mi_block_next(page,tail)) != NULL) {
+ while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) {
count++;
tail = next;
}
+ // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
+ if (count > max_count) {
+ _mi_fatal_error("corrupted thread-free list\n");
+ return; // the thread-free items cannot be freed
+ }
+
// and append the current local free list
mi_block_set_next(page,tail, page->local_free);
page->local_free = head;
@@ -192,7 +199,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
// usual case
page->free = page->local_free;
page->local_free = NULL;
- page->flags.is_zero = false;
+ page->is_zero = false;
}
else if (force) {
// append -- only on shutdown (force) as this is a linear operation
@@ -204,7 +211,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
mi_block_set_next(page, tail, page->free);
page->free = page->local_free;
page->local_free = NULL;
- page->flags.is_zero = false;
+ page->is_zero = false;
}
}
@@ -276,7 +283,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
// and free them all
while(block != NULL) {
- mi_block_t* next = mi_block_nextx(heap->cookie,block);
+ mi_block_t* next = mi_block_nextx(heap,block, heap->cookie);
// use internal free instead of regular one to keep stats etc correct
if (!_mi_free_delayed_block(block)) {
// we might already start delayed freeing while another thread has not yet
@@ -284,7 +291,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
mi_block_t* dfree;
do {
dfree = (mi_block_t*)heap->thread_delayed_free;
- mi_block_set_nextx(heap->cookie, block, dfree);
+ mi_block_set_nextx(heap, block, dfree, heap->cookie);
} while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
}
@@ -336,18 +343,24 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(page->heap != NULL);
- _mi_page_use_delayed_free(page,MI_NEVER_DELAYED_FREE);
+#if MI_DEBUG > 1
+ mi_heap_t* pheap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap));
+#endif
+
+ // remove from our page list
+ mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
+ mi_page_queue_remove(pq, page);
+
+ // page is no longer associated with our heap
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
+
#if MI_DEBUG>1
// check there are no references left..
- for (mi_block_t* block = (mi_block_t*)page->heap->thread_delayed_free; block != NULL; block = mi_block_nextx(page->heap->cookie,block)) {
+ for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->cookie)) {
mi_assert_internal(_mi_ptr_page(block) != page);
}
#endif
- // and then remove from our page list
- mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
- mi_page_queue_remove(pq, page);
-
// and abandon it
mi_assert_internal(page->heap == NULL);
_mi_segment_page_abandon(page,segments_tld);
@@ -370,6 +383,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
mi_page_set_has_aligned(page, false);
// account for huge pages here
+ // (note: no longer necessary as huge pages are always abandoned)
if (page->block_size > MI_LARGE_OBJ_SIZE_MAX) {
if (page->block_size > MI_HUGE_OBJ_SIZE_MAX) {
_mi_stat_decrease(&page->heap->tld->stats.giant, page->block_size);
@@ -378,7 +392,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
_mi_stat_decrease(&page->heap->tld->stats.huge, page->block_size);
}
}
-
+
// remove from the page list
// (no need to do _mi_heap_delayed_free first as all blocks are already free)
mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
@@ -406,16 +420,18 @@ void _mi_page_retire(mi_page_t* page) {
// (or we end up retiring and re-allocating most of the time)
// NOTE: refine this more: we should not retire if this
// is the only page left with free blocks. It is not clear
- // how to check this efficiently though... for now we just check
- // if its neighbours are almost fully used.
+ // how to check this efficiently though...
+ // for now, we don't retire if it is the only page left of this size class.
+ mi_page_queue_t* pq = mi_page_queue_of(page);
if (mi_likely(page->block_size <= (MI_SMALL_SIZE_MAX/4))) {
- if (mi_page_mostly_used(page->prev) && mi_page_mostly_used(page->next)) {
+ // if (mi_page_mostly_used(page->prev) && mi_page_mostly_used(page->next)) {
+ if (pq->last==page && pq->first==page) {
mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
return; // dont't retire after all
}
}
- _mi_page_free(page, mi_page_queue_of(page), false);
+ _mi_page_free(page, pq, false);
}
@@ -429,13 +445,15 @@ void _mi_page_retire(mi_page_t* page) {
#define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT)
#define MI_MIN_SLICES (2)
-static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, size_t extend, mi_stats_t* stats) {
+static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t extend, mi_stats_t* const stats) {
UNUSED(stats);
+ #if (MI_SECURE<=2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
+ #endif
mi_assert_internal(page->capacity + extend <= page->reserved);
- void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
- size_t bsize = page->block_size;
+ void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
+ const size_t bsize = page->block_size;
// initialize a randomized free list
// set up `slice_count` slices to alternate between
@@ -443,8 +461,8 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
while ((extend >> shift) == 0) {
shift--;
}
- size_t slice_count = (size_t)1U << shift;
- size_t slice_extend = extend / slice_count;
+ const size_t slice_count = (size_t)1U << shift;
+ const size_t slice_extend = extend / slice_count;
mi_assert_internal(slice_extend >= 1);
mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice
size_t counts[MI_MAX_SLICES]; // available objects in the slice
@@ -458,12 +476,12 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
// set up first element
size_t current = _mi_heap_random(heap) % slice_count;
counts[current]--;
- page->free = blocks[current];
+ mi_block_t* const free_start = blocks[current];
// and iterate through the rest
uintptr_t rnd = heap->random;
for (size_t i = 1; i < extend; i++) {
// call random_shuffle only every INTPTR_SIZE rounds
- size_t round = i%MI_INTPTR_SIZE;
+ const size_t round = i%MI_INTPTR_SIZE;
if (round == 0) rnd = _mi_random_shuffle(rnd);
// select a random next slice index
size_t next = ((rnd >> 8*round) & (slice_count-1));
@@ -473,34 +491,39 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
}
// and link the current block to it
counts[next]--;
- mi_block_t* block = blocks[current];
+ mi_block_t* const block = blocks[current];
blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block
mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next`
current = next;
}
- mi_block_set_next(page, blocks[current], NULL); // end of the list
+ // prepend to the free list (usually NULL)
+ mi_block_set_next(page, blocks[current], page->free); // chain the last block into the existing free list
+ page->free = free_start;
heap->random = _mi_random_shuffle(rnd);
}
-static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats)
+static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t extend, mi_stats_t* const stats)
{
UNUSED(stats);
+ #if (MI_SECURE <= 2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
+ #endif
mi_assert_internal(page->capacity + extend <= page->reserved);
- void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
- size_t bsize = page->block_size;
- mi_block_t* start = mi_page_block_at(page, page_area, page->capacity);
+ void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
+ const size_t bsize = page->block_size;
+ mi_block_t* const start = mi_page_block_at(page, page_area, page->capacity);
// initialize a sequential free list
- mi_block_t* last = mi_page_block_at(page, page_area, page->capacity + extend - 1);
+ mi_block_t* const last = mi_page_block_at(page, page_area, page->capacity + extend - 1);
mi_block_t* block = start;
while(block <= last) {
mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize);
mi_block_set_next(page,block,next);
block = next;
}
- mi_block_set_next(page, last, NULL);
+ // prepend to free list (usually `NULL`)
+ mi_block_set_next(page, last, page->free);
page->free = start;
}
@@ -509,7 +532,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t e
----------------------------------------------------------- */
#define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well.
-#if MI_SECURE
+#if (MI_SECURE>0)
#define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many
#else
#define MI_MIN_EXTEND (1)
@@ -522,15 +545,17 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t e
// extra test in malloc? or cache effects?)
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_stats_t* stats) {
UNUSED(stats);
+ mi_assert_expensive(mi_page_is_valid_init(page));
+ #if (MI_SECURE<=2)
mi_assert(page->free == NULL);
mi_assert(page->local_free == NULL);
- mi_assert_expensive(mi_page_is_valid_init(page));
if (page->free != NULL) return;
+ #endif
if (page->capacity >= page->reserved) return;
size_t page_size;
_mi_page_start(_mi_page_segment(page), page, &page_size);
- mi_stat_increase(stats->pages_extended, 1);
+ mi_stat_counter_increase(stats->pages_extended, 1);
// calculate the extend count
size_t extend = page->reserved - page->capacity;
@@ -559,7 +584,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_stats_t* st
// extension into zero initialized memory preserves the zero'd free list
if (!page->is_zero_init) {
- page->flags.is_zero = false;
+ page->is_zero = false;
}
mi_assert_expensive(mi_page_is_valid_init(page));
}
@@ -576,10 +601,10 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
page->block_size = block_size;
mi_assert_internal(page_size / block_size < (1L<<16));
page->reserved = (uint16_t)(page_size / block_size);
- #if MI_SECURE
+ #ifdef MI_ENCODE_FREELIST
page->cookie = _mi_heap_random(heap) | 1;
#endif
- page->flags.is_zero = page->is_zero_init;
+ page->is_zero = page->is_zero_init;
mi_assert_internal(page->capacity == 0);
mi_assert_internal(page->free == NULL);
@@ -589,7 +614,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_internal(page->next == NULL);
mi_assert_internal(page->prev == NULL);
mi_assert_internal(!mi_page_has_aligned(page));
- #if MI_SECURE
+ #if (MI_ENCODE_FREELIST)
mi_assert_internal(page->cookie != 0);
#endif
mi_assert_expensive(mi_page_is_valid_init(page));
@@ -736,7 +761,8 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) {
mi_assert_internal(_mi_page_segment(page)->page_kind==MI_PAGE_HUGE);
mi_assert_internal(_mi_page_segment(page)->used==1);
mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
- page->heap = NULL;
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
+
if (page->block_size > MI_HUGE_OBJ_SIZE_MAX) {
_mi_stat_increase(&heap->tld->stats.giant, block_size);
_mi_stat_counter_increase(&heap->tld->stats.giant_count, 1);
diff --git a/src/stats.c b/src/stats.c
index 37a7bde4..50bd029d 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -95,15 +95,17 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1);
- mi_stat_add(&stats->mmap_calls, &src->mmap_calls, 1);
- mi_stat_add(&stats->commit_calls, &src->commit_calls, 1);
mi_stat_add(&stats->threads, &src->threads, 1);
- mi_stat_add(&stats->pages_extended, &src->pages_extended, 1);
mi_stat_add(&stats->malloc, &src->malloc, 1);
mi_stat_add(&stats->segments_cache, &src->segments_cache, 1);
mi_stat_add(&stats->huge, &src->huge, 1);
mi_stat_add(&stats->giant, &src->giant, 1);
+
+ mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1);
+ mi_stat_counter_add(&stats->mmap_calls, &src->mmap_calls, 1);
+ mi_stat_counter_add(&stats->commit_calls, &src->commit_calls, 1);
+
mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
mi_stat_counter_add(&stats->searches, &src->searches, 1);
mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
@@ -121,6 +123,9 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
Display statistics
----------------------------------------------------------- */
+// unit > 0 : size in binary bytes
+// unit == 0: count as decimal
+// unit < 0 : count in binary
static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, const char* fmt) {
char buf[32];
int len = 32;
@@ -165,17 +170,24 @@ static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t
_mi_fprintf(out, " ok\n");
}
else if (unit<0) {
- mi_print_amount(stat->peak, 1, out);
- mi_print_amount(stat->allocated, 1, out);
- mi_print_amount(stat->freed, 1, out);
- mi_print_amount(-unit, 1, out);
- mi_print_count((stat->allocated / -unit), 0, out);
+ mi_print_amount(stat->peak, -1, out);
+ mi_print_amount(stat->allocated, -1, out);
+ mi_print_amount(stat->freed, -1, out);
+ if (unit==-1) {
+ _mi_fprintf(out, "%22s", "");
+ }
+ else {
+ mi_print_amount(-unit, 1, out);
+ mi_print_count((stat->allocated / -unit), 0, out);
+ }
if (stat->allocated > stat->freed)
_mi_fprintf(out, " not all freed!\n");
else
_mi_fprintf(out, " ok\n");
}
else {
+ mi_print_amount(stat->peak, 1, out);
+ mi_print_amount(stat->allocated, 1, out);
_mi_fprintf(out, "\n");
}
}
@@ -247,11 +259,11 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, mi_output_fun* out)
mi_stat_print(&stats->segments_cache, "-cached", -1, out);
mi_stat_print(&stats->pages, "pages", -1, out);
mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out);
- mi_stat_print(&stats->pages_extended, "-extended", 0, out);
+ mi_stat_counter_print(&stats->pages_extended, "-extended", out);
mi_stat_counter_print(&stats->page_no_retire, "-noretire", out);
- mi_stat_print(&stats->mmap_calls, "mmaps", 0, out);
- mi_stat_print(&stats->commit_calls, "commits", 0, out);
- mi_stat_print(&stats->threads, "threads", 0, out);
+ mi_stat_counter_print(&stats->mmap_calls, "mmaps", out);
+ mi_stat_counter_print(&stats->commit_calls, "commits", out);
+ mi_stat_print(&stats->threads, "threads", -1, out);
mi_stat_counter_print_avg(&stats->searches, "searches", out);
if (secs >= 0.0) _mi_fprintf(out, "%10s: %9.3f s\n", "elapsed", secs);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8bf36521..a80dde58 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -13,7 +13,7 @@ if (NOT CMAKE_BUILD_TYPE)
endif()
# Import mimalloc (if installed)
-find_package(mimalloc 1.0 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH)
+find_package(mimalloc 1.2 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH)
message(STATUS "Found mimalloc installed at: ${MIMALLOC_TARGET_DIR}")
# overriding with a dynamic library
diff --git a/test/main-override-static.c b/test/main-override-static.c
index 6ddf4f37..b04bfeef 100644
--- a/test/main-override-static.c
+++ b/test/main-override-static.c
@@ -2,12 +2,23 @@
#include
#include
#include
+#include <string.h>
#include
#include // redefines malloc etc.
+static void double_free1();
+static void double_free2();
+static void corrupt_free();
+
int main() {
mi_version();
+
+ // detect double frees and heap corruption
+ // double_free1();
+ // double_free2();
+ // corrupt_free();
+
void* p1 = malloc(78);
void* p2 = malloc(24);
free(p1);
@@ -29,3 +40,70 @@ int main() {
mi_stats_print(NULL);
return 0;
}
+
+
+// The double free samples come from ArcHeap [1] by Insu Yun (issue #161)
+// [1]: https://arxiv.org/pdf/1903.00503.pdf
+
+static void double_free1() {
+ void* p[256];
+ //uintptr_t buf[256];
+
+ p[0] = mi_malloc(622616);
+ p[1] = mi_malloc(655362);
+ p[2] = mi_malloc(786432);
+ mi_free(p[2]);
+ // [VULN] Double free
+ mi_free(p[2]);
+ p[3] = mi_malloc(786456);
+ // [BUG] Found overlap
+ // p[3]=0x429b2ea2000 (size=917504), p[1]=0x429b2e42000 (size=786432)
+ fprintf(stderr, "p3: %p-%p, p1: %p-%p, p2: %p\n", p[3], (uint8_t*)(p[3]) + 786456, p[1], (uint8_t*)(p[1]) + 655362, p[2]);
+}
+
+static void double_free2() {
+ void* p[256];
+ //uintptr_t buf[256];
+ // [INFO] Command buffer: 0x327b2000
+ // [INFO] Input size: 182
+ p[0] = malloc(712352);
+ p[1] = malloc(786432);
+ free(p[0]);
+ // [VULN] Double free
+ free(p[0]);
+ p[2] = malloc(786440);
+ p[3] = malloc(917504);
+ p[4] = malloc(786440);
+ // [BUG] Found overlap
+ // p[4]=0x433f1402000 (size=917504), p[1]=0x433f14c2000 (size=786432)
+ fprintf(stderr, "p1: %p-%p, p2: %p-%p\n", p[4], (uint8_t*)(p[4]) + 917504, p[1], (uint8_t*)(p[1]) + 786432);
+}
+
+
+// Try to corrupt the heap through buffer overflow
+#define N 256
+#define SZ 64
+
+static void corrupt_free() {
+ void* p[N];
+ // allocate
+ for (int i = 0; i < N; i++) {
+ p[i] = malloc(SZ);
+ }
+ // free some
+ for (int i = 0; i < N; i += (N/10)) {
+ free(p[i]);
+ p[i] = NULL;
+ }
+ // try to corrupt the free list
+ for (int i = 0; i < N; i++) {
+ if (p[i] != NULL) {
+ memset(p[i], 0, SZ+8);
+ }
+ }
+ // allocate more.. trying to trigger an allocation from a corrupted entry
+ // this may need many allocations to get there (if at all)
+ for (int i = 0; i < 4096; i++) {
+ malloc(SZ);
+ }
+}
\ No newline at end of file
diff --git a/test/main-override.cpp b/test/main-override.cpp
index 2cafd2cd..e006ad27 100644
--- a/test/main-override.cpp
+++ b/test/main-override.cpp
@@ -2,6 +2,7 @@
#include
#include
#include
+#include
#include
#include
@@ -41,7 +42,7 @@ int main() {
p2 = malloc(16);
p1 = realloc(p1, 32);
free(p1);
- mi_free(p2);
+ free(p2);
mi_free(s);
Test* t = new Test(42);
delete t;
@@ -66,3 +67,5 @@ public:
};
static Static s = Static();
+
+
diff --git a/test/readme.md b/test/readme.md
index b74364ff..db3524cd 100644
--- a/test/readme.md
+++ b/test/readme.md
@@ -1,7 +1,7 @@
Testing allocators is difficult as bugs may only surface after particular
allocation patterns. The main approach to testing _mimalloc_ is therefore
to have extensive internal invariant checking (see `page_is_valid` in `page.c`
-for example), which is enabled in debug mode with `-DMI_CHECK_FULL=ON`.
+for example), which is enabled in debug mode with `-DMI_DEBUG_FULL=ON`.
The main testing strategy is then to run [`mimalloc-bench`][bench] using full
invariant checking to catch any potential problems over a wide range of intensive
allocation benchmarks and programs.
diff --git a/test/test-stress.c b/test/test-stress.c
index 354e2b07..6b2fb8c4 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -6,7 +6,8 @@ terms of the MIT license.
/* This is a stress test for the allocator, using multiple threads and
transferring objects between threads. This is not a typical workload
- but uses a random linear size distribution. Do not use this test as a benchmark!
+ but uses a random linear size distribution. Timing can also depend on
+ (random) thread scheduling. Do not use this test as a benchmark!
*/
#include
@@ -16,17 +17,35 @@ terms of the MIT license.
#include
#include
+// > mimalloc-test-stress [THREADS] [SCALE] [ITER]
+//
// argument defaults
-static int THREADS = 32; // more repeatable if THREADS <= #processors
-static int N = 20; // scaling factor
+static int THREADS = 32; // more repeatable if THREADS <= #processors
+static int SCALE = 50; // scaling factor
+static int ITER = 10; // N full iterations re-creating all threads
// static int THREADS = 8; // more repeatable if THREADS <= #processors
-// static int N = 100; // scaling factor
+// static int SCALE = 100; // scaling factor
+static bool allow_large_objects = true; // allow very large objects?
+static size_t use_one_size = 0; // use single object size of N uintptr_t?
+
+
+#ifdef USE_STD_MALLOC
+#define custom_malloc(s) malloc(s)
+#define custom_realloc(p,s) realloc(p,s)
+#define custom_free(p) free(p)
+#else
+#define custom_malloc(s) mi_malloc(s)
+#define custom_realloc(p,s) mi_realloc(p,s)
+#define custom_free(p) mi_free(p)
+#endif
+
+// transfer pointer between threads
#define TRANSFERS (1000)
-
static volatile void* transfer[TRANSFERS];
+
#if (UINTPTR_MAX != UINT32_MAX)
const uintptr_t cookie = 0xbf58476d1ce4e5b9UL;
#else
@@ -39,21 +58,21 @@ typedef uintptr_t* random_t;
static uintptr_t pick(random_t r) {
uintptr_t x = *r;
- #if (UINTPTR_MAX > UINT32_MAX)
- // by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
+#if (UINTPTR_MAX > UINT32_MAX)
+ // by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
x ^= x >> 30;
x *= 0xbf58476d1ce4e5b9UL;
x ^= x >> 27;
x *= 0x94d049bb133111ebUL;
x ^= x >> 31;
- #else
- // by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
+#else
+ // by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
x ^= x >> 16;
x *= 0x7feb352dUL;
x ^= x >> 15;
x *= 0x846ca68bUL;
x ^= x >> 16;
- #endif
+#endif
*r = x;
return x;
}
@@ -63,10 +82,17 @@ static bool chance(size_t perc, random_t r) {
}
static void* alloc_items(size_t items, random_t r) {
- if (chance(1, r)) items *= 100; // 1% huge objects;
- if (items==40) items++; // pthreads uses that size for stack increases
- uintptr_t* p = (uintptr_t*)mi_malloc(items*sizeof(uintptr_t));
- for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie;
+ if (chance(1, r)) {
+ if (chance(1, r) && allow_large_objects) items *= 10000; // 0.01% giant
+ else if (chance(10, r) && allow_large_objects) items *= 1000; // 0.1% huge
+ else items *= 100; // 1% large objects;
+ }
+ if (items == 40) items++; // pthreads uses that size for stack increases
+ if (use_one_size > 0) items = (use_one_size / sizeof(uintptr_t));
+ uintptr_t* p = (uintptr_t*)custom_malloc(items * sizeof(uintptr_t));
+ if (p != NULL) {
+ for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie;
+ }
return p;
}
@@ -75,42 +101,42 @@ static void free_items(void* p) {
uintptr_t* q = (uintptr_t*)p;
uintptr_t items = (q[0] ^ cookie);
for (uintptr_t i = 0; i < items; i++) {
- if ((q[i]^cookie) != items - i) {
+ if ((q[i] ^ cookie) != items - i) {
fprintf(stderr, "memory corruption at block %p at %zu\n", p, i);
abort();
}
}
}
- mi_free(p);
+ custom_free(p);
}
static void stress(intptr_t tid) {
//bench_start_thread();
- uintptr_t r = tid ^ 42;
- const size_t max_item = 128; // in words
- const size_t max_item_retained = 10*max_item;
- size_t allocs = 25*N*(tid%8 + 1); // some threads do more
- size_t retain = allocs/2;
+ uintptr_t r = tid * 43;
+ const size_t max_item_shift = 5; // 128
+ const size_t max_item_retained_shift = max_item_shift + 2;
+ size_t allocs = 100 * ((size_t)SCALE) * (tid % 8 + 1); // some threads do more
+ size_t retain = allocs / 2;
void** data = NULL;
size_t data_size = 0;
size_t data_top = 0;
- void** retained = (void**)mi_malloc(retain*sizeof(void*));
+ void** retained = (void**)custom_malloc(retain * sizeof(void*));
size_t retain_top = 0;
- while (allocs>0 || retain>0) {
+ while (allocs > 0 || retain > 0) {
if (retain == 0 || (chance(50, &r) && allocs > 0)) {
// 50%+ alloc
allocs--;
if (data_top >= data_size) {
data_size += 100000;
- data = (void**)mi_realloc(data, data_size*sizeof(void*));
+ data = (void**)custom_realloc(data, data_size * sizeof(void*));
}
- data[data_top++] = alloc_items((pick(&r) % max_item) + 1, &r);
+ data[data_top++] = alloc_items( 1ULL << (pick(&r) % max_item_shift), &r);
}
else {
// 25% retain
- retained[retain_top++] = alloc_items(10*((pick(&r) % max_item_retained) + 1), &r);
+ retained[retain_top++] = alloc_items( 1ULL << (pick(&r) % max_item_retained_shift), &r);
retain--;
}
if (chance(66, &r) && data_top > 0) {
@@ -120,7 +146,7 @@ static void stress(intptr_t tid) {
data[idx] = NULL;
}
if (chance(25, &r) && data_top > 0) {
- // 25% transfer-swap
+ // 25% exchange a local pointer with the (shared) transfer buffer.
size_t data_idx = pick(&r) % data_top;
size_t transfer_idx = pick(&r) % TRANSFERS;
void* p = data[data_idx];
@@ -135,38 +161,54 @@ static void stress(intptr_t tid) {
for (size_t i = 0; i < data_top; i++) {
free_items(data[i]);
}
- mi_free(retained);
- mi_free(data);
+ custom_free(retained);
+ custom_free(data);
//bench_end_thread();
}
static void run_os_threads(size_t nthreads);
int main(int argc, char** argv) {
- if (argc>=2) {
+ // > mimalloc-test-stress [THREADS] [SCALE] [ITER]
+ if (argc >= 2) {
char* end;
long n = strtol(argv[1], &end, 10);
if (n > 0) THREADS = n;
}
- if (argc>=3) {
+ if (argc >= 3) {
char* end;
long n = (strtol(argv[2], &end, 10));
- if (n > 0) N = n;
+ if (n > 0) SCALE = n;
}
- printf("start with %i threads with a %i%% load-per-thread\n", THREADS, N);
+ if (argc >= 4) {
+ char* end;
+ long n = (strtol(argv[3], &end, 10));
+ if (n > 0) ITER = n;
+ }
+ printf("start with %d threads with a %d%% load-per-thread and %d iterations\n", THREADS, SCALE, ITER);
//int res = mi_reserve_huge_os_pages(4,1);
//printf("(reserve huge: %i\n)", res);
//bench_start_program();
- memset((void*)transfer, 0, TRANSFERS*sizeof(void*));
- run_os_threads(THREADS);
- for (int i = 0; i < TRANSFERS; i++) {
- free_items((void*)transfer[i]);
+
+ // Run ITER full iterations where half the objects in the transfer buffer survive to the next round.
+ mi_stats_reset();
+ uintptr_t r = 43 * 43;
+ for (int n = 0; n < ITER; n++) {
+ run_os_threads(THREADS);
+ for (int i = 0; i < TRANSFERS; i++) {
+ if (chance(50, &r) || n + 1 == ITER) { // free all on last run, otherwise free half of the transfers
+ void* p = atomic_exchange_ptr(&transfer[i], NULL);
+ free_items(p);
+ }
+ }
+ mi_collect(false);
+#ifndef NDEBUG
+ if ((n + 1) % 10 == 0) { printf("- iterations: %3d\n", n + 1); }
+#endif
}
- #ifndef NDEBUG
- mi_collect(false);
+
mi_collect(true);
- #endif
mi_stats_print(NULL);
//bench_end_program();
return 0;
@@ -183,22 +225,27 @@ static DWORD WINAPI thread_entry(LPVOID param) {
}
static void run_os_threads(size_t nthreads) {
- DWORD* tids = (DWORD*)malloc(nthreads * sizeof(DWORD));
- HANDLE* thandles = (HANDLE*)malloc(nthreads * sizeof(HANDLE));
+ DWORD* tids = (DWORD*)custom_malloc(nthreads * sizeof(DWORD));
+ HANDLE* thandles = (HANDLE*)custom_malloc(nthreads * sizeof(HANDLE));
for (uintptr_t i = 0; i < nthreads; i++) {
thandles[i] = CreateThread(0, 4096, &thread_entry, (void*)(i), 0, &tids[i]);
}
for (size_t i = 0; i < nthreads; i++) {
WaitForSingleObject(thandles[i], INFINITE);
}
+ for (size_t i = 0; i < nthreads; i++) {
+ CloseHandle(thandles[i]);
+ }
+ custom_free(tids);
+ custom_free(thandles);
}
static void* atomic_exchange_ptr(volatile void** p, void* newval) {
- #if (INTPTR_MAX == UINT32_MAX)
+#if (INTPTR_MAX == UINT32_MAX)
return (void*)InterlockedExchange((volatile LONG*)p, (LONG)newval);
- #else
+#else
return (void*)InterlockedExchange64((volatile LONG64*)p, (LONG64)newval);
- #endif
+#endif
}
#else
@@ -211,8 +258,8 @@ static void* thread_entry(void* param) {
}
static void run_os_threads(size_t nthreads) {
- pthread_t* threads = (pthread_t*)mi_malloc(nthreads*sizeof(pthread_t));
- memset(threads, 0, sizeof(pthread_t)*nthreads);
+ pthread_t* threads = (pthread_t*)custom_malloc(nthreads * sizeof(pthread_t));
+ memset(threads, 0, sizeof(pthread_t) * nthreads);
//pthread_setconcurrency(nthreads);
for (uintptr_t i = 0; i < nthreads; i++) {
pthread_create(&threads[i], NULL, &thread_entry, (void*)i);
@@ -220,6 +267,7 @@ static void run_os_threads(size_t nthreads) {
for (size_t i = 0; i < nthreads; i++) {
pthread_join(threads[i], NULL);
}
+ custom_free(threads);
}
static void* atomic_exchange_ptr(volatile void** p, void* newval) {