diff --git a/CMakeLists.txt b/CMakeLists.txt
index 81cc339a..467fad95 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,14 +6,13 @@ set(CMAKE_CXX_STANDARD 17)
option(MI_OVERRIDE "Override the standard malloc interface" ON)
option(MI_INTERPOSE "Use interpose to override standard malloc on macOS" ON)
-option(MI_SEE_ASM "Generate assembly files" OFF)
-option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode" OFF)
+option(MI_DEBUG_FULL "Use full internal heap invariant checking in DEBUG mode" OFF)
+option(MI_SECURE "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF)
option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF)
-option(MI_SECURE "Use security mitigations (like guard pages and randomization)" OFF)
+option(MI_SEE_ASM "Generate assembly files" OFF)
option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF)
option(MI_BUILD_TESTS "Build test executables" ON)
-
-set(mi_install_dir "lib/mimalloc-${mi_version}")
+option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
set(mi_sources
src/stats.c
@@ -28,29 +27,33 @@ set(mi_sources
src/options.c
src/init.c)
-# Set default build type
+# -----------------------------------------------------------------------------
+# Convenience: set default build type depending on the build directory
+# -----------------------------------------------------------------------------
+
if (NOT CMAKE_BUILD_TYPE)
- if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$")
- message(STATUS "No build type selected, default to *** Debug ***")
+ if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$" OR MI_DEBUG_FULL MATCHES "ON")
+ message(STATUS "No build type selected, default to: Debug")
set(CMAKE_BUILD_TYPE "Debug")
else()
- message(STATUS "No build type selected, default to *** Release ***")
+ message(STATUS "No build type selected, default to: Release")
set(CMAKE_BUILD_TYPE "Release")
endif()
-else()
- message(STATUS "Build type specified as *** ${CMAKE_BUILD_TYPE} ***")
endif()
if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$")
+ message(STATUS "Default to secure build")
set(MI_SECURE "ON")
endif()
+# -----------------------------------------------------------------------------
+# Process options
+# -----------------------------------------------------------------------------
+
if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
set(MI_USE_CXX "ON")
endif()
-
-# Options
if(MI_OVERRIDE MATCHES "ON")
message(STATUS "Override standard malloc (MI_OVERRIDE=ON)")
if(APPLE)
@@ -67,8 +70,8 @@ if(MI_OVERRIDE MATCHES "ON")
endif()
if(MI_SECURE MATCHES "ON")
- message(STATUS "Set secure build (MI_SECURE=ON)")
- list(APPEND mi_defines MI_SECURE=3)
+ message(STATUS "Set full secure build (MI_SECURE=ON)")
+ list(APPEND mi_defines MI_SECURE=4)
endif()
if(MI_SEE_ASM MATCHES "ON")
@@ -77,7 +80,12 @@ if(MI_SEE_ASM MATCHES "ON")
endif()
if(MI_CHECK_FULL MATCHES "ON")
- message(STATUS "Set debug level to full invariant checking (MI_CHECK_FULL=ON)")
+ message(STATUS "The MI_CHECK_FULL option is deprecated, use MI_DEBUG_FULL instead")
+ set(MI_DEBUG_FULL "ON")
+endif()
+
+if(MI_DEBUG_FULL MATCHES "ON")
+ message(STATUS "Set debug level to full internal invariant checking (MI_DEBUG_FULL=ON)")
list(APPEND mi_defines MI_DEBUG=3) # full invariant checking
endif()
@@ -102,19 +110,6 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
endif()
endif()
-if(NOT(CMAKE_BUILD_TYPE MATCHES "Release|release|RelWithDebInfo|relwithdebinfo"))
- string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type)
- set(mi_basename "mimalloc-${build_type}")
-else()
- if(MI_SECURE MATCHES "ON")
- set(mi_basename "mimalloc-secure")
- else()
- set(mi_basename "mimalloc")
- endif()
-endif()
-message(STATUS "Output library name : ${mi_basename}")
-message(STATUS "Installation directory: ${mi_install_dir}")
-
# extra needed libraries
if(WIN32)
list(APPEND mi_libraries psapi shell32 user32)
@@ -127,9 +122,28 @@ else()
endif()
# -----------------------------------------------------------------------------
-# Main targets
+# Install and output names
# -----------------------------------------------------------------------------
+set(mi_install_dir "${CMAKE_INSTALL_PREFIX}/lib/mimalloc-${mi_version}")
+if(MI_SECURE MATCHES "ON")
+ set(mi_basename "mimalloc-secure")
+else()
+ set(mi_basename "mimalloc")
+endif()
+string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LC)
+if(NOT(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel)$"))
+ set(mi_basename "${mi_basename}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. -debug) if not a release version
+endif()
+message(STATUS "")
+message(STATUS "Library base name: ${mi_basename}")
+message(STATUS "Build type : ${CMAKE_BUILD_TYPE_LC}")
+message(STATUS "Install directory: ${mi_install_dir}")
+message(STATUS "")
+
+# -----------------------------------------------------------------------------
+# Main targets
+# -----------------------------------------------------------------------------
# shared library
add_library(mimalloc SHARED ${mi_sources})
@@ -231,7 +245,7 @@ endif()
if (MI_OVERRIDE MATCHES "ON")
target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE)
if(NOT WIN32)
- # It is only possible to override malloc on Windows when building as a DLL. (src/alloc-override.c)
+ # It is only possible to override malloc on Windows when building as a DLL.
target_compile_definitions(mimalloc-static PRIVATE MI_MALLOC_OVERRIDE)
target_compile_definitions(mimalloc-obj PRIVATE MI_MALLOC_OVERRIDE)
endif()
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 79228c41..41d67f86 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -35,22 +35,32 @@ jobs:
CC: gcc
CXX: g++
BuildType: debug
- cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_CHECK_FULL=ON
+ cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
Release:
CC: gcc
CXX: g++
BuildType: release
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
+ Secure:
+ CC: gcc
+ CXX: g++
+ BuildType: secure
+ cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
Debug Clang:
CC: clang
CXX: clang++
BuildType: debug-clang
- cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_CHECK_FULL=ON
+ cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
Release Clang:
CC: clang
CXX: clang++
BuildType: release-clang
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
+ Secure Clang:
+ CC: clang
+ CXX: clang++
+ BuildType: secure-clang
+ cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
steps:
- task: CMake@1
diff --git a/bin/mimalloc-redirect.dll b/bin/mimalloc-redirect.dll
index a1daf316..b7bf1d09 100644
Binary files a/bin/mimalloc-redirect.dll and b/bin/mimalloc-redirect.dll differ
diff --git a/bin/mimalloc-redirect.lib b/bin/mimalloc-redirect.lib
index 1e22ef12..550db8ec 100644
Binary files a/bin/mimalloc-redirect.lib and b/bin/mimalloc-redirect.lib differ
diff --git a/bin/mimalloc-redirect32.dll b/bin/mimalloc-redirect32.dll
index 52c27a2a..7ba303af 100644
Binary files a/bin/mimalloc-redirect32.dll and b/bin/mimalloc-redirect32.dll differ
diff --git a/bin/mimalloc-redirect32.lib b/bin/mimalloc-redirect32.lib
index c99aa32c..66173060 100644
Binary files a/bin/mimalloc-redirect32.lib and b/bin/mimalloc-redirect32.lib differ
diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake
index 03316948..9d78b5a0 100644
--- a/cmake/mimalloc-config-version.cmake
+++ b/cmake/mimalloc-config-version.cmake
@@ -1,5 +1,5 @@
set(mi_version_major 1)
-set(mi_version_minor 1)
+set(mi_version_minor 2)
set(mi_version ${mi_version_major}.${mi_version_minor})
set(PACKAGE_VERSION ${mi_version})
diff --git a/ide/vs2019/mimalloc-override.vcxproj.filters b/ide/vs2019/mimalloc-override.vcxproj.filters
new file mode 100644
index 00000000..bc1e4c60
--- /dev/null
+++ b/ide/vs2019/mimalloc-override.vcxproj.filters
@@ -0,0 +1,72 @@
+
+
+
+
+ Header Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+
+
+ {f1fccf27-17b9-42dd-ba51-6070baff85c6}
+
+
+ {39cb7e38-69d0-43fb-8406-6a0f7cefc3b4}
+
+
+
\ No newline at end of file
diff --git a/ide/vs2019/mimalloc-test-stress.vcxproj b/ide/vs2019/mimalloc-test-stress.vcxproj
index afbb6666..ef7ab357 100644
--- a/ide/vs2019/mimalloc-test-stress.vcxproj
+++ b/ide/vs2019/mimalloc-test-stress.vcxproj
@@ -149,8 +149,8 @@
-
- {abb5eae7-b3e6-432e-b636-333449892ea7}
+
+ {abb5eae7-b3e6-432e-b636-333449892ea6}
diff --git a/ide/vs2019/mimalloc.vcxproj.filters b/ide/vs2019/mimalloc.vcxproj.filters
new file mode 100644
index 00000000..b2282df3
--- /dev/null
+++ b/ide/vs2019/mimalloc.vcxproj.filters
@@ -0,0 +1,75 @@
+
+
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+
+
+ {2b556b10-f559-4b2d-896e-142652adbf0c}
+
+
+ {852a14ae-6dde-4e95-8077-ca705e97e5af}
+
+
+
\ No newline at end of file
diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h
index dff0f011..10368df3 100644
--- a/include/mimalloc-atomic.h
+++ b/include/mimalloc-atomic.h
@@ -220,7 +220,7 @@ static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x)
#endif
#elif defined(__wasi__)
#include
- static inline void mi_atomic_yield() {
+ static inline void mi_atomic_yield(void) {
sched_yield();
}
#else
diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index ccf12a06..452f0b68 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -101,6 +101,7 @@ uint8_t _mi_bsr(uintptr_t x); // bit-scan-right, used on BSD i
void _mi_heap_destroy_pages(mi_heap_t* heap);
void _mi_heap_collect_abandon(mi_heap_t* heap);
uintptr_t _mi_heap_random(mi_heap_t* heap);
+void _mi_heap_set_default_direct(mi_heap_t* heap);
// "stats.c"
void _mi_stats_done(mi_stats_t* stats);
@@ -274,14 +275,20 @@ static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
return segment;
}
-// Get the page containing the pointer
-static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
+// used internally
+static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) {
// if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages
ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
mi_assert_internal(diff >= 0 && diff < MI_SEGMENT_SIZE);
uintptr_t idx = (uintptr_t)diff >> segment->page_shift;
mi_assert_internal(idx < segment->capacity);
mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0);
+ return idx;
+}
+
+// Get the page containing the pointer
+static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
+ uintptr_t idx = _mi_segment_page_idx_of(segment, p);
return &((mi_segment_t*)segment)->pages[idx];
}
@@ -372,53 +379,67 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
// -------------------------------------------------------------------
// Encoding/Decoding the free list next pointers
+// Note: we pass a `null` value to be used as the `NULL` value for the
+// end of a free list. This is to prevent the cookie itself to ever
+// be present among user blocks (as `cookie^0==cookie`).
// -------------------------------------------------------------------
static inline bool mi_is_in_same_segment(const void* p, const void* q) {
return (_mi_ptr_segment(p) == _mi_ptr_segment(q));
}
-static inline mi_block_t* mi_block_nextx( uintptr_t cookie, const mi_block_t* block ) {
+static inline bool mi_is_in_same_page(const void* p, const void* q) {
+ mi_segment_t* segmentp = _mi_ptr_segment(p);
+ mi_segment_t* segmentq = _mi_ptr_segment(q);
+ if (segmentp != segmentq) return false;
+ uintptr_t idxp = _mi_segment_page_idx_of(segmentp, p);
+ uintptr_t idxq = _mi_segment_page_idx_of(segmentq, q);
+ return (idxp == idxq);
+}
+
+static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, uintptr_t cookie ) {
#ifdef MI_ENCODE_FREELIST
- return (mi_block_t*)(block->next ^ cookie);
+ mi_block_t* b = (mi_block_t*)(block->next ^ cookie);
+ if (mi_unlikely((void*)b==null)) { b = NULL; }
+ return b;
#else
- UNUSED(cookie);
+ UNUSED(cookie); UNUSED(null);
return (mi_block_t*)block->next;
#endif
}
-static inline void mi_block_set_nextx(uintptr_t cookie, mi_block_t* block, const mi_block_t* next) {
+static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, uintptr_t cookie) {
#ifdef MI_ENCODE_FREELIST
+ if (mi_unlikely(next==NULL)) { next = (mi_block_t*)null; }
block->next = (mi_encoded_t)next ^ cookie;
#else
- UNUSED(cookie);
+ UNUSED(cookie); UNUSED(null);
block->next = (mi_encoded_t)next;
#endif
}
static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
#ifdef MI_ENCODE_FREELIST
- mi_block_t* next = mi_block_nextx(page->cookie,block);
+ mi_block_t* next = mi_block_nextx(page,block,page->cookie);
// check for free list corruption: is `next` at least in our segment range?
- // TODO: it is better to check if it is actually inside our page but that is more expensive
- // to calculate. Perhaps with a relative free list this becomes feasible?
- if (next!=NULL && !mi_is_in_same_segment(block, next)) {
+ // TODO: check if `next` is `page->block_size` aligned?
+ if (next!=NULL && !mi_is_in_same_page(block, next)) {
_mi_fatal_error("corrupted free list entry of size %zub at %p: value 0x%zx\n", page->block_size, block, (uintptr_t)next);
next = NULL;
}
return next;
#else
UNUSED(page);
- return mi_block_nextx(0, block);
+ return mi_block_nextx(page,block,0);
#endif
}
static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
#ifdef MI_ENCODE_FREELIST
- mi_block_set_nextx(page->cookie,block,next);
+ mi_block_set_nextx(page,block,next, page->cookie);
#else
UNUSED(page);
- mi_block_set_nextx(0, block, next);
+ mi_block_set_nextx(page,block, next,0);
#endif
}
diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index 99b6b22b..96e1860f 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -26,7 +26,7 @@ terms of the MIT license. A copy of the license can be found in the file
// #define MI_SECURE 1 // guard page around metadata
// #define MI_SECURE 2 // guard page around each mimalloc page
// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
-// #define MI_SECURE 4 // experimental, may be more expensive: checks for double free.
+// #define MI_SECURE 4 // checks for double free. (may be more expensive)
#if !defined(MI_SECURE)
#define MI_SECURE 0
@@ -35,7 +35,7 @@ terms of the MIT license. A copy of the license can be found in the file
// Define MI_DEBUG for debug mode
// #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free.
// #define MI_DEBUG 2 // + internal assertion checks
-// #define MI_DEBUG 3 // + extensive internal invariant checking
+// #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
#if !defined(MI_DEBUG)
#if !defined(NDEBUG) || defined(_DEBUG)
#define MI_DEBUG 2
diff --git a/include/mimalloc.h b/include/mimalloc.h
index b63ed79d..7f26896c 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
-#define MI_MALLOC_VERSION 110 // major + 2 digits minor
+#define MI_MALLOC_VERSION 120 // major + 2 digits minor
// ------------------------------------------------------
// Compiler specific attributes
@@ -271,6 +271,7 @@ typedef enum mi_option_e {
mi_option_eager_commit_delay,
mi_option_segment_reset,
mi_option_os_tag,
+ mi_option_max_errors,
_mi_option_last
} mi_option_t;
diff --git a/readme.md b/readme.md
index 0d11db16..9d3974c9 100644
--- a/readme.md
+++ b/readme.md
@@ -1,7 +1,7 @@
-[
](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
+[
](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
# mimalloc
@@ -37,7 +37,7 @@ Notable aspects of the design include:
programs.
- __secure__: _mimalloc_ can be built in secure mode, adding guard pages,
randomized allocation, encrypted free lists, etc. to protect against various
- heap vulnerabilities. The performance penalty is only around 3% on average
+ heap vulnerabilities. The performance penalty is usually around 10% on average
over our benchmarks.
- __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions.
A heap can be destroyed at once instead of deallocating each object separately.
@@ -56,6 +56,7 @@ Enjoy!
### Releases
+* 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows.
* 2019-10-07, `v1.1.0`: stable release 1.1.
* 2019-09-01, `v1.0.8`: pre-release 8: more robust windows dynamic overriding, initial huge page support.
* 2019-08-10, `v1.0.6`: pre-release 6: various performance improvements.
@@ -64,7 +65,7 @@ Enjoy!
## Windows
-Open `ide/vs2017/mimalloc.sln` in Visual Studio 2017 and build.
+Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build (or `ide/vs2017/mimalloc.sln`).
The `mimalloc` project builds a static library (in `out/msvc-x64`), while the
`mimalloc-override` project builds a DLL for overriding malloc
in the entire program.
@@ -97,7 +98,7 @@ maintains detailed statistics as:
This will name the shared library as `libmimalloc-debug.so`.
Finally, you can build a _secure_ version that uses guard pages, encrypted
-free lists, etc, as:
+free lists, etc., as:
```
> mkdir -p out/secure
> cd out/secure
@@ -138,6 +139,9 @@ target_link_libraries(myapp PUBLIC mimalloc-static)
```
to link with the static library. See `test\CMakeLists.txt` for an example.
+For best performance in C++ programs, it is also recommended to override the
+global `new` and `delete` operators. For convenience, mimalloc provides
+[mimalloc-new-delete.h](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project.
You can pass environment variables to print verbose messages (`MIMALLOC_VERBOSE=1`)
and statistics (`MIMALLOC_SHOW_STATS=1`) (in the debug version):
@@ -188,18 +192,18 @@ or via environment variables.
- `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates.
- `MIMALLOC_VERBOSE=1`: show verbose messages.
- `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages.
-- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages when available; for some workloads this can significantly
+- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages when available; for some workloads this can significantly
improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs
to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes
the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that
can have fragmented memory.
- `MIMALLOC_EAGER_REGION_COMMIT=1`: on Windows, commit large (256MiB) regions eagerly. On Windows, these regions
- show in the working set even though usually just a small part is committed to physical memory. This is why it
- turned off by default on Windows as it looks not good in the task manager. However, in reality it is always better
+ show in the working set even though usually just a small part is committed to physical memory. This is why it
+ is turned off by default on Windows as it does not look good in the task manager. However, in reality it is always better 
to turn it on as it improves performance and has no other drawbacks.
- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB huge OS pages. This reserves the huge pages at
- startup and can give quite a performance improvement on long running workloads. Usually it is better to not use
- `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
+ startup and can give quite a performance improvement on long running workloads. Usually it is better to not use
+ `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
contiguous physical memory can take a long time when memory is fragmented. Still experimental.
[linux-huge]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/5/html/tuning_and_optimizing_red_hat_enterprise_linux_for_oracle_9i_and_10g_databases/sect-oracle_9i_and_10g_tuning_guide-large_memory_optimization_big_pages_and_huge_pages-configuring_huge_pages_in_red_hat_enterprise_linux_4_or_5
@@ -211,7 +215,7 @@ Overriding the standard `malloc` can be done either _dynamically_ or _statically
## Dynamic override
-This is the recommended way to override the standard malloc interface.
+This is the recommended way to override the standard malloc interface.
### Linux, BSD
@@ -244,29 +248,31 @@ resolved to the _mimalloc_ library.
Note that certain security restrictions may apply when doing this from
the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash).
-Note: unfortunately, at this time, dynamic overriding on macOS seems broken but it is actively worked on to fix this
+Note: unfortunately, at this time, dynamic overriding on macOS seems broken but it is actively worked on to fix this
(see issue [`#50`](https://github.com/microsoft/mimalloc/issues/50)).
### Windows
On Windows you need to link your program explicitly with the mimalloc
-DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
-Moreover, you need to ensure the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) is available
-in the same folder as the mimalloc DLL at runtime (as it as referred to by the mimalloc DLL).
-The redirection DLL's ensure all calls to the C runtime malloc API get redirected to mimalloc.
+DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
+Moreover, you need to ensure the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) is available
+in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency).
+The redirection DLL ensures that all calls to the C runtime malloc API get redirected to
+mimalloc (in `mimalloc-override.dll`).
To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some
-call to the mimalloc API in the `main` function, like `mi_version()`
+call to the mimalloc API in the `main` function, like `mi_version()`
(or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project
-for an example on how to use this.
+for an example on how to use this. For best performance on Windows with C++, it
+is highly recommended to also override the `new`/`delete` operations (as described
+in the introduction).
The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic
-overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc successfully redirected.
+overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected.
-(Note: in principle, it should be possible to patch existing executables
-that are linked with the dynamic C runtime (`ucrtbase.dll`) by just putting the mimalloc DLL into
-the import table (and putting `mimalloc-redirect.dll` in the same folder)
-Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)).
+(Note: in principle, it is possible to patch existing executables
+that are linked with the dynamic C runtime (`ucrtbase.dll`) by just putting the `mimalloc-override.dll` into the import table (and putting `mimalloc-redirect.dll` in the same folder)
+Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)).
## Static override
@@ -282,6 +288,12 @@ object file. For example:
> gcc -o myprogram mimalloc-override.o myfile1.c ...
```
+Another way to override statically that works on all platforms, is to
+link statically to mimalloc (as shown in the introduction) and include a
+header file in each source file that re-defines `malloc` etc. to `mi_malloc`.
+This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h). This only works reliably though if all sources are
+under your control or otherwise mixing of pointers from different heaps may occur!
+
# Performance
diff --git a/src/alloc.c b/src/alloc.c
index d2319f82..e68b48d2 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -157,7 +157,7 @@ static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, con
}
static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
- mi_block_t* n = mi_block_nextx(page->cookie, block); // pretend it is freed, and get the decoded first field
+ mi_block_t* n = mi_block_nextx(page, block, page->cookie); // pretend it is freed, and get the decoded first field
if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer?
(n==NULL || mi_is_in_same_segment(block, n))) // quick check: in same segment or NULL?
{
@@ -235,14 +235,14 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
}
else {
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
- mi_heap_t* heap = page->heap;
+ mi_heap_t* heap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap));
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree;
do {
dfree = (mi_block_t*)heap->thread_delayed_free;
- mi_block_set_nextx(heap->cookie,block,dfree);
+ mi_block_set_nextx(heap,block,dfree, heap->cookie);
} while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
}
diff --git a/src/heap.c b/src/heap.c
index 15c5d02a..daa9b241 100644
--- a/src/heap.c
+++ b/src/heap.c
@@ -223,7 +223,7 @@ static void mi_heap_free(mi_heap_t* heap) {
// reset default
if (mi_heap_is_default(heap)) {
- _mi_heap_default = heap->tld->heap_backing;
+ _mi_heap_set_default_direct(heap->tld->heap_backing);
}
// and free the used memory
mi_free(heap);
@@ -354,8 +354,8 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
if (!mi_heap_is_initialized(heap)) return NULL;
mi_assert_expensive(mi_heap_is_valid(heap));
- mi_heap_t* old = _mi_heap_default;
- _mi_heap_default = heap;
+ mi_heap_t* old = mi_get_default_heap();
+ _mi_heap_set_default_direct(heap);
return old;
}
diff --git a/src/init.c b/src/init.c
index e15d82eb..81413aa9 100644
--- a/src/init.c
+++ b/src/init.c
@@ -90,6 +90,7 @@ const mi_heap_t _mi_heap_empty = {
false
};
+// the thread-local default heap for allocation
mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
@@ -198,8 +199,8 @@ static bool _mi_heap_init(void) {
if (mi_heap_is_initialized(_mi_heap_default)) return true;
if (_mi_is_main_thread()) {
// the main heap is statically allocated
- _mi_heap_default = &_mi_heap_main;
- mi_assert_internal(_mi_heap_default->tld->heap_backing == _mi_heap_default);
+ _mi_heap_set_default_direct(&_mi_heap_main);
+ mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
}
else {
// use `_mi_os_alloc` to allocate directly from the OS
@@ -219,18 +220,17 @@ static bool _mi_heap_init(void) {
tld->heap_backing = heap;
tld->segments.stats = &tld->stats;
tld->os.stats = &tld->stats;
- _mi_heap_default = heap;
+ _mi_heap_set_default_direct(heap);
}
return false;
}
// Free the thread local default heap (called from `mi_thread_done`)
-static bool _mi_heap_done(void) {
- mi_heap_t* heap = _mi_heap_default;
+static bool _mi_heap_done(mi_heap_t* heap) {
if (!mi_heap_is_initialized(heap)) return true;
// reset default heap
- _mi_heap_default = (_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
+ _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
// todo: delete all non-backing heaps?
@@ -277,6 +277,8 @@ static bool _mi_heap_done(void) {
// to set up the thread local keys.
// --------------------------------------------------------
+static void _mi_thread_done(mi_heap_t* default_heap);
+
#ifdef __wasi__
// no pthreads in the WebAssembly Standard Interface
#elif !defined(_WIN32)
@@ -291,14 +293,14 @@ static bool _mi_heap_done(void) {
#include
static DWORD mi_fls_key;
static void NTAPI mi_fls_done(PVOID value) {
- if (value!=NULL) mi_thread_done();
+ if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
}
#elif defined(MI_USE_PTHREADS)
// use pthread locol storage keys to detect thread ending
#include
static pthread_key_t mi_pthread_key;
static void mi_pthread_done(void* value) {
- if (value!=NULL) mi_thread_done();
+ if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
}
#elif defined(__wasi__)
// no pthreads in the WebAssembly Standard Interface
@@ -332,6 +334,8 @@ void mi_thread_init(void) mi_attr_noexcept
mi_process_init();
// initialize the thread local default heap
+ // (this will call `_mi_heap_set_default_direct` and thus set the
+ // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
if (_mi_heap_init()) return; // returns true if already initialized
// don't further initialize for the main thread
@@ -339,33 +343,38 @@ void mi_thread_init(void) mi_attr_noexcept
_mi_stat_increase(&mi_get_default_heap()->tld->stats.threads, 1);
- // set hooks so our mi_thread_done() will be called
- #if defined(_WIN32) && defined(MI_SHARED_LIB)
- // nothing to do as it is done in DllMain
- #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
- FlsSetValue(mi_fls_key, (void*)(_mi_thread_id()|1)); // set to a dummy value so that `mi_fls_done` is called
- #elif defined(MI_USE_PTHREADS)
- pthread_setspecific(mi_pthread_key, (void*)(_mi_thread_id()|1)); // set to a dummy value so that `mi_pthread_done` is called
- #endif
-
//_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
}
void mi_thread_done(void) mi_attr_noexcept {
+ _mi_thread_done(mi_get_default_heap());
+}
+
+static void _mi_thread_done(mi_heap_t* heap) {
// stats
- mi_heap_t* heap = mi_get_default_heap();
if (!_mi_is_main_thread() && mi_heap_is_initialized(heap)) {
_mi_stat_decrease(&heap->tld->stats.threads, 1);
}
-
// abandon the thread local heap
- if (_mi_heap_done()) return; // returns true if already ran
-
- //if (!_mi_is_main_thread()) {
- // _mi_verbose_message("thread done: 0x%zx\n", _mi_thread_id());
- //}
+ if (_mi_heap_done(heap)) return; // returns true if already ran
}
+void _mi_heap_set_default_direct(mi_heap_t* heap) {
+ mi_assert_internal(heap != NULL);
+ _mi_heap_default = heap;
+
+ // ensure the default heap is passed to `_mi_thread_done`
+ // setting to a non-NULL value also ensures `mi_thread_done` is called.
+ #if defined(_WIN32) && defined(MI_SHARED_LIB)
+ // nothing to do as it is done in DllMain
+ #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+ FlsSetValue(mi_fls_key, heap);
+ #elif defined(MI_USE_PTHREADS)
+ pthread_setspecific(mi_pthread_key, heap);
+ #endif
+}
+
+
// --------------------------------------------------------
// Run functions on process init/done, and thread init/done
@@ -446,7 +455,7 @@ void mi_process_init(void) mi_attr_noexcept {
// access _mi_heap_default before setting _mi_process_is_initialized to ensure
// that the TLS slot is allocated without getting into recursion on macOS
// when using dynamic linking with interpose.
- mi_heap_t* h = _mi_heap_default;
+ mi_heap_t* h = mi_get_default_heap();
_mi_process_is_initialized = true;
_mi_heap_main.thread_id = _mi_thread_id();
@@ -461,6 +470,7 @@ void mi_process_init(void) mi_attr_noexcept {
#if (MI_DEBUG)
_mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif
+ _mi_verbose_message("secure level: %d\n", MI_SECURE);
mi_thread_init();
mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL)
}
diff --git a/src/options.c b/src/options.c
index a49c46ed..0bee74e0 100644
--- a/src/options.c
+++ b/src/options.c
@@ -14,6 +14,10 @@ terms of the MIT license. A copy of the license can be found in the file
#include // toupper
#include
+static uintptr_t mi_max_error_count = 16; // stop outputting errors after this
+
+static void mi_add_stderr_output();
+
int mi_version(void) mi_attr_noexcept {
return MI_MALLOC_VERSION;
}
@@ -66,13 +70,16 @@ static mi_option_desc_t options[_mi_option_last] =
{ 0, UNINIT, MI_OPTION(reset_decommits) }, // note: cannot enable this if secure is on
{ 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
{ 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit)
- { 100, UNINIT, MI_OPTION(os_tag) } // only apple specific for now but might serve more or less related purpose
+ { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose
+ { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output
};
static void mi_option_init(mi_option_desc_t* desc);
void _mi_options_init(void) {
- // called on process load
+ // called on process load; should not be called before the CRT is initialized!
+ // (e.g. do not call this from process_init as that may run before CRT initialization)
+ mi_add_stderr_output(); // now it safe to use stderr for output
for(int i = 0; i < _mi_option_last; i++ ) {
mi_option_t option = (mi_option_t)i;
mi_option_get(option); // initialize
@@ -81,6 +88,7 @@ void _mi_options_init(void) {
_mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
}
}
+ mi_max_error_count = mi_option_get(mi_option_max_errors);
}
long mi_option_get(mi_option_t option) {
@@ -134,7 +142,7 @@ static void mi_out_stderr(const char* msg) {
#ifdef _WIN32
// on windows with redirection, the C runtime cannot handle locale dependent output
// after the main thread closes so we use direct console output.
- _cputs(msg);
+ if (!_mi_preloading()) { _cputs(msg); }
#else
fputs(msg, stderr);
#endif
@@ -165,23 +173,29 @@ static void mi_out_buf(const char* msg) {
memcpy(&out_buf[start], msg, n);
}
-static void mi_out_buf_flush(mi_output_fun* out) {
+static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf) {
if (out==NULL) return;
- // claim all (no more output will be added after this point)
- size_t count = mi_atomic_addu(&out_len, MI_MAX_DELAY_OUTPUT);
+ // claim (if `no_more_buf == true`, no more output will be added after this point)
+ size_t count = mi_atomic_addu(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
// and output the current contents
if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
out_buf[count] = 0;
out(out_buf);
+ if (!no_more_buf) {
+ out_buf[count] = '\n'; // if continue with the buffer, insert a newline
+ }
}
-// The initial default output, outputs to stderr and the delayed output buffer.
+
+// Once this module is loaded, switch to this routine
+// which outputs to stderr and the delayed output buffer.
static void mi_out_buf_stderr(const char* msg) {
mi_out_stderr(msg);
mi_out_buf(msg);
}
+
// --------------------------------------------------------
// Default output handler
// --------------------------------------------------------
@@ -193,14 +207,19 @@ static mi_output_fun* volatile mi_out_default; // = NULL
static mi_output_fun* mi_out_get_default(void) {
mi_output_fun* out = mi_out_default;
- return (out == NULL ? &mi_out_buf_stderr : out);
+ return (out == NULL ? &mi_out_buf : out);
}
void mi_register_output(mi_output_fun* out) mi_attr_noexcept {
mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer
- if (out!=NULL) mi_out_buf_flush(out); // output the delayed output now
+ if (out!=NULL) mi_out_buf_flush(out,true); // output all the delayed output now
}
+// add stderr to the delayed output after the module is loaded
+static void mi_add_stderr_output() {
+ mi_out_buf_flush(&mi_out_stderr, false); // flush current contents to stderr
+ mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output
+}
// --------------------------------------------------------
// Messages, all end up calling `_mi_fputs`.
@@ -213,7 +232,7 @@ static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT
static mi_decl_thread bool recurse = false;
void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message) {
- if (_mi_preloading() || recurse) return;
+ if (recurse) return;
if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) out = mi_out_get_default();
recurse = true;
if (prefix != NULL) out(prefix);
@@ -227,7 +246,7 @@ void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message) {
static void mi_vfprintf( mi_output_fun* out, const char* prefix, const char* fmt, va_list args ) {
char buf[512];
if (fmt==NULL) return;
- if (_mi_preloading() || recurse) return;
+ if (recurse) return;
recurse = true;
vsnprintf(buf,sizeof(buf)-1,fmt,args);
recurse = false;
@@ -260,7 +279,7 @@ void _mi_verbose_message(const char* fmt, ...) {
void _mi_error_message(const char* fmt, ...) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
- if (mi_atomic_increment(&error_count) > MAX_ERROR_COUNT) return;
+ if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(NULL, "mimalloc: error: ", fmt, args);
@@ -270,7 +289,7 @@ void _mi_error_message(const char* fmt, ...) {
void _mi_warning_message(const char* fmt, ...) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
- if (mi_atomic_increment(&error_count) > MAX_ERROR_COUNT) return;
+ if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(NULL, "mimalloc: warning: ", fmt, args);
diff --git a/src/page-queue.c b/src/page-queue.c
index 4af70b50..95443a69 100644
--- a/src/page-queue.c
+++ b/src/page-queue.c
@@ -260,7 +260,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
page->heap->page_count--;
page->next = NULL;
page->prev = NULL;
- page->heap = NULL;
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
mi_page_set_in_full(page,false);
}
@@ -274,7 +274,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
- page->heap = heap;
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
page->next = queue->first;
page->prev = NULL;
if (queue->first != NULL) {
@@ -338,7 +338,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
// set append pages to new heap and count
size_t count = 0;
for (mi_page_t* page = append->first; page != NULL; page = page->next) {
- page->heap = heap;
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
count++;
}
diff --git a/src/page.c b/src/page.c
index ab271309..437cd0a5 100644
--- a/src/page.c
+++ b/src/page.c
@@ -283,7 +283,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
// and free them all
while(block != NULL) {
- mi_block_t* next = mi_block_nextx(heap->cookie,block);
+ mi_block_t* next = mi_block_nextx(heap,block, heap->cookie);
// use internal free instead of regular one to keep stats etc correct
if (!_mi_free_delayed_block(block)) {
// we might already start delayed freeing while another thread has not yet
@@ -291,7 +291,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
mi_block_t* dfree;
do {
dfree = (mi_block_t*)heap->thread_delayed_free;
- mi_block_set_nextx(heap->cookie, block, dfree);
+ mi_block_set_nextx(heap, block, dfree, heap->cookie);
} while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
}
@@ -343,18 +343,24 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(page->heap != NULL);
- _mi_page_use_delayed_free(page,MI_NEVER_DELAYED_FREE);
+#if MI_DEBUG > 1
+ mi_heap_t* pheap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap));
+#endif
+
+ // remove from our page list
+ mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
+ mi_page_queue_remove(pq, page);
+
+ // page is no longer associated with our heap
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
+
#if MI_DEBUG>1
// check there are no references left..
- for (mi_block_t* block = (mi_block_t*)page->heap->thread_delayed_free; block != NULL; block = mi_block_nextx(page->heap->cookie,block)) {
+ for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->cookie)) {
mi_assert_internal(_mi_ptr_page(block) != page);
}
#endif
- // and then remove from our page list
- mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
- mi_page_queue_remove(pq, page);
-
// and abandon it
mi_assert_internal(page->heap == NULL);
_mi_segment_page_abandon(page,segments_tld);
@@ -439,13 +445,15 @@ void _mi_page_retire(mi_page_t* page) {
#define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT)
#define MI_MIN_SLICES (2)
-static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, size_t extend, mi_stats_t* stats) {
+static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t extend, mi_stats_t* const stats) {
UNUSED(stats);
+ #if (MI_SECURE<=2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
+ #endif
mi_assert_internal(page->capacity + extend <= page->reserved);
- void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
- size_t bsize = page->block_size;
+ void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
+ const size_t bsize = page->block_size;
// initialize a randomized free list
// set up `slice_count` slices to alternate between
@@ -453,8 +461,8 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
while ((extend >> shift) == 0) {
shift--;
}
- size_t slice_count = (size_t)1U << shift;
- size_t slice_extend = extend / slice_count;
+ const size_t slice_count = (size_t)1U << shift;
+ const size_t slice_extend = extend / slice_count;
mi_assert_internal(slice_extend >= 1);
mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice
size_t counts[MI_MAX_SLICES]; // available objects in the slice
@@ -468,12 +476,12 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
// set up first element
size_t current = _mi_heap_random(heap) % slice_count;
counts[current]--;
- page->free = blocks[current];
+ mi_block_t* const free_start = blocks[current];
// and iterate through the rest
uintptr_t rnd = heap->random;
for (size_t i = 1; i < extend; i++) {
// call random_shuffle only every INTPTR_SIZE rounds
- size_t round = i%MI_INTPTR_SIZE;
+ const size_t round = i%MI_INTPTR_SIZE;
if (round == 0) rnd = _mi_random_shuffle(rnd);
// select a random next slice index
size_t next = ((rnd >> 8*round) & (slice_count-1));
@@ -483,34 +491,39 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
}
// and link the current block to it
counts[next]--;
- mi_block_t* block = blocks[current];
+ mi_block_t* const block = blocks[current];
blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block
mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next`
current = next;
}
- mi_block_set_next(page, blocks[current], NULL); // end of the list
+ // prepend to the free list (usually NULL)
+ mi_block_set_next(page, blocks[current], page->free); // end of the list
+ page->free = free_start;
heap->random = _mi_random_shuffle(rnd);
}
-static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats)
+static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t extend, mi_stats_t* const stats)
{
UNUSED(stats);
+ #if (MI_SECURE <= 2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
+ #endif
mi_assert_internal(page->capacity + extend <= page->reserved);
- void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
- size_t bsize = page->block_size;
- mi_block_t* start = mi_page_block_at(page, page_area, page->capacity);
+ void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
+ const size_t bsize = page->block_size;
+ mi_block_t* const start = mi_page_block_at(page, page_area, page->capacity);
// initialize a sequential free list
- mi_block_t* last = mi_page_block_at(page, page_area, page->capacity + extend - 1);
+ mi_block_t* const last = mi_page_block_at(page, page_area, page->capacity + extend - 1);
mi_block_t* block = start;
while(block <= last) {
mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize);
mi_block_set_next(page,block,next);
block = next;
}
- mi_block_set_next(page, last, NULL);
+ // prepend to free list (usually `NULL`)
+ mi_block_set_next(page, last, page->free);
page->free = start;
}
@@ -532,10 +545,12 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t e
// extra test in malloc? or cache effects?)
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_stats_t* stats) {
UNUSED(stats);
+ mi_assert_expensive(mi_page_is_valid_init(page));
+ #if (MI_SECURE<=2)
mi_assert(page->free == NULL);
mi_assert(page->local_free == NULL);
- mi_assert_expensive(mi_page_is_valid_init(page));
if (page->free != NULL) return;
+ #endif
if (page->capacity >= page->reserved) return;
size_t page_size;
@@ -746,7 +761,8 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) {
mi_assert_internal(_mi_page_segment(page)->page_kind==MI_PAGE_HUGE);
mi_assert_internal(_mi_page_segment(page)->used==1);
mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
- page->heap = NULL;
+ mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
+
if (page->block_size > MI_HUGE_OBJ_SIZE_MAX) {
_mi_stat_increase(&heap->tld->stats.giant, block_size);
_mi_stat_counter_increase(&heap->tld->stats.giant_count, 1);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8bf36521..a80dde58 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -13,7 +13,7 @@ if (NOT CMAKE_BUILD_TYPE)
endif()
# Import mimalloc (if installed)
-find_package(mimalloc 1.0 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH)
+find_package(mimalloc 1.2 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH)
message(STATUS "Found mimalloc installed at: ${MIMALLOC_TARGET_DIR}")
# overriding with a dynamic library
diff --git a/test/main-override-static.c b/test/main-override-static.c
index 19712411..b04bfeef 100644
--- a/test/main-override-static.c
+++ b/test/main-override-static.c
@@ -15,9 +15,9 @@ int main() {
mi_version();
// detect double frees and heap corruption
- //double_free1();
- //double_free2();
- //corrupt_free();
+ // double_free1();
+ // double_free2();
+ // corrupt_free();
void* p1 = malloc(78);
void* p2 = malloc(24);
diff --git a/test/readme.md b/test/readme.md
index b74364ff..db3524cd 100644
--- a/test/readme.md
+++ b/test/readme.md
@@ -1,7 +1,7 @@
Testing allocators is difficult as bugs may only surface after particular
allocation patterns. The main approach to testing _mimalloc_ is therefore
to have extensive internal invariant checking (see `page_is_valid` in `page.c`
-for example), which is enabled in debug mode with `-DMI_CHECK_FULL=ON`.
+for example), which is enabled in debug mode with `-DMI_DEBUG_FULL=ON`.
The main testing strategy is then to run [`mimalloc-bench`][bench] using full
invariant checking to catch any potential problems over a wide range of intensive
allocation benchmarks and programs.
diff --git a/test/test-stress.c b/test/test-stress.c
index bb428072..6b2fb8c4 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -6,7 +6,8 @@ terms of the MIT license.
/* This is a stress test for the allocator, using multiple threads and
transferring objects between threads. This is not a typical workload
- but uses a random linear size distribution. Do not use this test as a benchmark!
+ but uses a random linear size distribution. Timing can also depend on
+ (random) thread scheduling. Do not use this test as a benchmark!
*/
#include
@@ -16,17 +17,35 @@ terms of the MIT license.
#include
#include
+// > mimalloc-test-stress [THREADS] [SCALE] [ITER]
+//
// argument defaults
-static int THREADS = 32; // more repeatable if THREADS <= #processors
-static int N = 20; // scaling factor
+static int THREADS = 32; // more repeatable if THREADS <= #processors
+static int SCALE = 50; // scaling factor
+static int ITER = 10; // N full iterations re-creating all threads
// static int THREADS = 8; // more repeatable if THREADS <= #processors
-// static int N = 100; // scaling factor
+// static int SCALE = 100; // scaling factor
+static bool allow_large_objects = true; // allow very large objects?
+static size_t use_one_size = 0; // use single object size of N uintptr_t?
+
+
+#ifdef USE_STD_MALLOC
+#define custom_malloc(s) malloc(s)
+#define custom_realloc(p,s) realloc(p,s)
+#define custom_free(p) free(p)
+#else
+#define custom_malloc(s) mi_malloc(s)
+#define custom_realloc(p,s) mi_realloc(p,s)
+#define custom_free(p) mi_free(p)
+#endif
+
+// transfer pointer between threads
#define TRANSFERS (1000)
-
static volatile void* transfer[TRANSFERS];
+
#if (UINTPTR_MAX != UINT32_MAX)
const uintptr_t cookie = 0xbf58476d1ce4e5b9UL;
#else
@@ -39,21 +58,21 @@ typedef uintptr_t* random_t;
static uintptr_t pick(random_t r) {
uintptr_t x = *r;
- #if (UINTPTR_MAX > UINT32_MAX)
- // by Sebastiano Vigna, see:
+#if (UINTPTR_MAX > UINT32_MAX)
+ // by Sebastiano Vigna, see:
x ^= x >> 30;
x *= 0xbf58476d1ce4e5b9UL;
x ^= x >> 27;
x *= 0x94d049bb133111ebUL;
x ^= x >> 31;
- #else
- // by Chris Wellons, see:
+#else
+ // by Chris Wellons, see:
x ^= x >> 16;
x *= 0x7feb352dUL;
x ^= x >> 15;
x *= 0x846ca68bUL;
x ^= x >> 16;
- #endif
+#endif
*r = x;
return x;
}
@@ -63,10 +82,17 @@ static bool chance(size_t perc, random_t r) {
}
static void* alloc_items(size_t items, random_t r) {
- if (chance(1, r)) items *= 100; // 1% huge objects;
- if (items==40) items++; // pthreads uses that size for stack increases
- uintptr_t* p = (uintptr_t*)mi_malloc(items*sizeof(uintptr_t));
- for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie;
+ if (chance(1, r)) {
+ if (chance(1, r) && allow_large_objects) items *= 10000; // 0.01% giant
+ else if (chance(10, r) && allow_large_objects) items *= 1000; // 0.1% huge
+ else items *= 100; // 1% large objects;
+ }
+ if (items == 40) items++; // pthreads uses that size for stack increases
+ if (use_one_size > 0) items = (use_one_size / sizeof(uintptr_t));
+ uintptr_t* p = (uintptr_t*)custom_malloc(items * sizeof(uintptr_t));
+ if (p != NULL) {
+ for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie;
+ }
return p;
}
@@ -75,42 +101,42 @@ static void free_items(void* p) {
uintptr_t* q = (uintptr_t*)p;
uintptr_t items = (q[0] ^ cookie);
for (uintptr_t i = 0; i < items; i++) {
- if ((q[i]^cookie) != items - i) {
+ if ((q[i] ^ cookie) != items - i) {
fprintf(stderr, "memory corruption at block %p at %zu\n", p, i);
abort();
}
}
}
- mi_free(p);
+ custom_free(p);
}
static void stress(intptr_t tid) {
//bench_start_thread();
- uintptr_t r = tid ^ 42;
- const size_t max_item = 128; // in words
- const size_t max_item_retained = 10*max_item;
- size_t allocs = 25*N*(tid%8 + 1); // some threads do more
- size_t retain = allocs/2;
+ uintptr_t r = tid * 43;
+ const size_t max_item_shift = 5; // 128
+ const size_t max_item_retained_shift = max_item_shift + 2;
+ size_t allocs = 100 * ((size_t)SCALE) * (tid % 8 + 1); // some threads do more
+ size_t retain = allocs / 2;
void** data = NULL;
size_t data_size = 0;
size_t data_top = 0;
- void** retained = (void**)mi_malloc(retain*sizeof(void*));
+ void** retained = (void**)custom_malloc(retain * sizeof(void*));
size_t retain_top = 0;
- while (allocs>0 || retain>0) {
+ while (allocs > 0 || retain > 0) {
if (retain == 0 || (chance(50, &r) && allocs > 0)) {
// 50%+ alloc
allocs--;
if (data_top >= data_size) {
data_size += 100000;
- data = (void**)mi_realloc(data, data_size*sizeof(void*));
+ data = (void**)custom_realloc(data, data_size * sizeof(void*));
}
- data[data_top++] = alloc_items((pick(&r) % max_item) + 1, &r);
+ data[data_top++] = alloc_items( 1ULL << (pick(&r) % max_item_shift), &r);
}
else {
// 25% retain
- retained[retain_top++] = alloc_items(10*((pick(&r) % max_item_retained) + 1), &r);
+ retained[retain_top++] = alloc_items( 1ULL << (pick(&r) % max_item_retained_shift), &r);
retain--;
}
if (chance(66, &r) && data_top > 0) {
@@ -120,7 +146,7 @@ static void stress(intptr_t tid) {
data[idx] = NULL;
}
if (chance(25, &r) && data_top > 0) {
- // 25% transfer-swap
+ // 25% exchange a local pointer with the (shared) transfer buffer.
size_t data_idx = pick(&r) % data_top;
size_t transfer_idx = pick(&r) % TRANSFERS;
void* p = data[data_idx];
@@ -135,38 +161,54 @@ static void stress(intptr_t tid) {
for (size_t i = 0; i < data_top; i++) {
free_items(data[i]);
}
- mi_free(retained);
- mi_free(data);
+ custom_free(retained);
+ custom_free(data);
//bench_end_thread();
}
static void run_os_threads(size_t nthreads);
int main(int argc, char** argv) {
- if (argc>=2) {
+ // > mimalloc-test-stress [THREADS] [SCALE] [ITER]
+ if (argc >= 2) {
char* end;
long n = strtol(argv[1], &end, 10);
if (n > 0) THREADS = n;
}
- if (argc>=3) {
+ if (argc >= 3) {
char* end;
long n = (strtol(argv[2], &end, 10));
- if (n > 0) N = n;
+ if (n > 0) SCALE = n;
}
- printf("start with %i threads with a %i%% load-per-thread\n", THREADS, N);
+ if (argc >= 4) {
+ char* end;
+ long n = (strtol(argv[3], &end, 10));
+ if (n > 0) ITER = n;
+ }
+ printf("start with %d threads with a %d%% load-per-thread and %d iterations\n", THREADS, SCALE, ITER);
//int res = mi_reserve_huge_os_pages(4,1);
//printf("(reserve huge: %i\n)", res);
//bench_start_program();
+
+ // Run ITER full iterations where half the objects in the transfer buffer survive to the next round.
mi_stats_reset();
- memset((void*)transfer, 0, TRANSFERS*sizeof(void*));
- run_os_threads(THREADS);
- for (int i = 0; i < TRANSFERS; i++) {
- free_items((void*)transfer[i]);
+ uintptr_t r = 43 * 43;
+ for (int n = 0; n < ITER; n++) {
+ run_os_threads(THREADS);
+ for (int i = 0; i < TRANSFERS; i++) {
+ if (chance(50, &r) || n + 1 == ITER) { // free all on last run, otherwise free half of the transfers
+ void* p = atomic_exchange_ptr(&transfer[i], NULL);
+ free_items(p);
+ }
+ }
+ mi_collect(false);
+#ifndef NDEBUG
+ if ((n + 1) % 10 == 0) { printf("- iterations: %3d\n", n + 1); }
+#endif
}
- #ifndef NDEBUG
- mi_collect(false);
- #endif
+
+ mi_collect(true);
mi_stats_print(NULL);
//bench_end_program();
return 0;
@@ -183,8 +225,8 @@ static DWORD WINAPI thread_entry(LPVOID param) {
}
static void run_os_threads(size_t nthreads) {
- DWORD* tids = (DWORD*)malloc(nthreads * sizeof(DWORD));
- HANDLE* thandles = (HANDLE*)malloc(nthreads * sizeof(HANDLE));
+ DWORD* tids = (DWORD*)custom_malloc(nthreads * sizeof(DWORD));
+ HANDLE* thandles = (HANDLE*)custom_malloc(nthreads * sizeof(HANDLE));
for (uintptr_t i = 0; i < nthreads; i++) {
thandles[i] = CreateThread(0, 4096, &thread_entry, (void*)(i), 0, &tids[i]);
}
@@ -194,16 +236,16 @@ static void run_os_threads(size_t nthreads) {
for (size_t i = 0; i < nthreads; i++) {
CloseHandle(thandles[i]);
}
- free(tids);
- free(thandles);
+ custom_free(tids);
+ custom_free(thandles);
}
static void* atomic_exchange_ptr(volatile void** p, void* newval) {
- #if (INTPTR_MAX == UINT32_MAX)
+#if (INTPTR_MAX == UINT32_MAX)
return (void*)InterlockedExchange((volatile LONG*)p, (LONG)newval);
- #else
+#else
return (void*)InterlockedExchange64((volatile LONG64*)p, (LONG64)newval);
- #endif
+#endif
}
#else
@@ -216,8 +258,8 @@ static void* thread_entry(void* param) {
}
static void run_os_threads(size_t nthreads) {
- pthread_t* threads = (pthread_t*)mi_malloc(nthreads*sizeof(pthread_t));
- memset(threads, 0, sizeof(pthread_t)*nthreads);
+ pthread_t* threads = (pthread_t*)custom_malloc(nthreads * sizeof(pthread_t));
+ memset(threads, 0, sizeof(pthread_t) * nthreads);
//pthread_setconcurrency(nthreads);
for (uintptr_t i = 0; i < nthreads; i++) {
pthread_create(&threads[i], NULL, &thread_entry, (void*)i);
@@ -225,6 +267,7 @@ static void run_os_threads(size_t nthreads) {
for (size_t i = 0; i < nthreads; i++) {
pthread_join(threads[i], NULL);
}
+ custom_free(threads);
}
static void* atomic_exchange_ptr(volatile void** p, void* newval) {