diff --git a/ide/vs2017/mimalloc-override.vcxproj b/ide/vs2017/mimalloc-override.vcxproj index f41b2efc..7d452b55 100644 --- a/ide/vs2017/mimalloc-override.vcxproj +++ b/ide/vs2017/mimalloc-override.vcxproj @@ -35,7 +35,6 @@ DynamicLibrary false v141 - true DynamicLibrary @@ -46,7 +45,6 @@ DynamicLibrary false v141 - true @@ -70,25 +68,25 @@ $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc + mimalloc-override $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc + mimalloc-override $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc + mimalloc-override $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc + mimalloc-override @@ -100,15 +98,17 @@ MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false - CompileAsCpp + Default - ../../bin/mimalloc-redirect32.lib;%(AdditionalDependencies) + %(AdditionalDependencies) Default + DllEntry + false @@ -121,15 +121,17 @@ MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false - CompileAsCpp + Default - ../../bin/mimalloc-redirect.lib;%(AdditionalDependencies) + %(AdditionalDependencies) Default + DllEntry + false COPY /Y $(SolutionDir)..\..\bin\mimalloc-redirect.dll $(OutputPath) @@ -152,15 +154,17 @@ $(IntDir) false MultiThreadedDLL - CompileAsCpp + Default true true - ../../bin/mimalloc-redirect32.lib;%(AdditionalDependencies) + %(AdditionalDependencies) Default + DllEntry + false @@ -177,15 +181,17 @@ $(IntDir) false MultiThreadedDLL - CompileAsCpp + Default true true - ../../bin/mimalloc-redirect.lib;%(AdditionalDependencies) + %(AdditionalDependencies) Default + DllEntry + false COPY /Y $(SolutionDir)..\..\bin\mimalloc-redirect.dll $(OutputPath) @@ -208,6 +214,7 @@ false false + true true diff --git a/ide/vs2017/mimalloc-override.vcxproj.filters b/ide/vs2017/mimalloc-override.vcxproj.filters index ffabddac..df0bf5ed 100644 --- a/ide/vs2017/mimalloc-override.vcxproj.filters +++ b/ide/vs2017/mimalloc-override.vcxproj.filters @@ -67,5 +67,8 @@ Source Files + + Source Files + \ No newline at end of file diff --git a/ide/vs2017/mimalloc-test-stress.vcxproj b/ide/vs2017/mimalloc-test-stress.vcxproj index e8cc5045..b8267d0b 100644 --- a/ide/vs2017/mimalloc-test-stress.vcxproj +++ b/ide/vs2017/mimalloc-test-stress.vcxproj @@ -67,19 +67,19 @@ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ diff 
--git a/ide/vs2017/mimalloc-test.vcxproj b/ide/vs2017/mimalloc-test.vcxproj index c1539aeb..27c7bb6e 100644 --- a/ide/vs2017/mimalloc-test.vcxproj +++ b/ide/vs2017/mimalloc-test.vcxproj @@ -67,19 +67,19 @@ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index e261dba2..ad9b3ecf 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -307,13 +307,23 @@ static inline bool mi_page_all_used(mi_page_t* page) { static inline bool mi_page_mostly_used(const mi_page_t* page) { if (page==NULL) return true; uint16_t frac = page->reserved / 8U; - return (page->reserved - page->used + page->thread_freed < frac); + return (page->reserved - page->used + page->thread_freed <= frac); } static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; } +static inline uintptr_t mi_page_thread_id(const mi_page_t* page) { + return (page->flags.xthread_id << MI_PAGE_FLAGS_BITS); +} + +static inline void mi_page_init_flags(mi_page_t* page, uintptr_t thread_id) { + page->flags.value = 0; + page->flags.xthread_id = (thread_id >> MI_PAGE_FLAGS_BITS); + mi_assert(page->flags.value == thread_id); +} + // ------------------------------------------------------------------- // Encoding/Decoding the free list next pointers // ------------------------------------------------------------------- diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 5c14ffd4..4002c12c 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -91,19 +91,19 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE) #define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE) -#define MI_MEDIUM_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/8) // 64kb on 64-bit -#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/8) // 512kb on 64-bit +#define MI_MEDIUM_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128kb on 64-bit +#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/4) // 1Mb on 64-bit #define MI_LARGE_WSIZE_MAX (MI_LARGE_SIZE_MAX>>MI_INTPTR_SHIFT) -// Maximum number of size classes. (spaced exponentially in 16.7% increments) -#define MI_BIN_HUGE (64U) - // Minimal alignment necessary. On most platforms 16 bytes are needed // due to SSE registers for example. This must be at least `MI_INTPTR_SIZE` #define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) -#if (MI_LARGE_WSIZE_MAX > 131072) +// Maximum number of size classes. 
(spaced exponentially in 12.5% increments) +#define MI_BIN_HUGE (70U) + +#if (MI_LARGE_WSIZE_MAX > 393216) #error "define more bins" #endif @@ -123,14 +123,26 @@ typedef enum mi_delayed_e { } mi_delayed_t; +// Use the lowest two bits of a thread id for the `in_full` and `has_aligned` flags +// This allows a single test in `mi_free` to check for unlikely cases +// (namely, non-local free, aligned free, or freeing in a full page) +#define MI_PAGE_FLAGS_BITS (2) +#define MI_PAGE_FLAGS_TID_BITS (MI_INTPTR_SIZE*8 - MI_PAGE_FLAGS_BITS) typedef union mi_page_flags_u { - uint16_t value; + uintptr_t value; struct { - bool has_aligned; - bool in_full; + #ifdef MI_BIG_ENDIAN + uintptr_t xthread_id : MI_PAGE_FLAGS_TID_BITS; + #endif + uintptr_t in_full : 1; + uintptr_t has_aligned : 1; + #ifndef MI_BIG_ENDIAN + uintptr_t xthread_id : MI_PAGE_FLAGS_TID_BITS; + #endif }; } mi_page_flags_t; + // Thread free list. // We use the bottom 2 bits of the pointer for mi_delayed_t flags typedef uintptr_t mi_thread_free_t; @@ -161,15 +173,15 @@ typedef struct mi_page_s { bool is_committed:1; // `true` if the page virtual memory is committed // layout like this to optimize access in `mi_malloc` and `mi_free` - mi_page_flags_t flags; uint16_t capacity; // number of blocks committed uint16_t reserved; // number of blocks reserved in memory - + // 16 bits padding mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) #if MI_SECURE uintptr_t cookie; // random cookie to encode the free lists #endif size_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) + mi_page_flags_t flags; // threadid:62 | has_aligned:1 | in_full:1 mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) volatile uintptr_t thread_freed; // at least this number of blocks are in `thread_free` @@ -182,10 +194,10 @@ typedef struct mi_page_s { struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` // improve page index calculation -#if MI_INTPTR_SIZE==8 - //void* padding[1]; // 12 words on 64-bit +#if (MI_INTPTR_SIZE==8 && MI_SECURE==0) + void* padding[1]; // 12 words on 64-bit #elif MI_INTPTR_SIZE==4 - void* padding[1]; // 12 words on 32-bit + // void* padding[1]; // 12 words on 32-bit #endif } mi_page_t; @@ -215,7 +227,7 @@ typedef struct mi_segment_s { // layout like this to optimize access in `mi_free` size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). 
- uintptr_t thread_id; // unique id of the thread owning this segment + volatile uintptr_t thread_id; // unique id of the thread owning this segment mi_page_kind_t page_kind; // kind of pages: small, large, or huge mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages } mi_segment_t; @@ -324,12 +336,12 @@ typedef struct mi_stats_s { mi_stat_count_t pages_abandoned; mi_stat_count_t pages_extended; mi_stat_count_t mmap_calls; - mi_stat_count_t mmap_right_align; - mi_stat_count_t mmap_ensure_aligned; mi_stat_count_t commit_calls; mi_stat_count_t threads; mi_stat_count_t huge; mi_stat_count_t malloc; + mi_stat_count_t segments_cache; + mi_stat_counter_t page_no_retire; mi_stat_counter_t searches; #if MI_STAT>1 mi_stat_count_t normal[MI_BIN_HUGE+1]; diff --git a/src/alloc-override-win.c b/src/alloc-override-win.c new file mode 100644 index 00000000..d1d51b9a --- /dev/null +++ b/src/alloc-override-win.c @@ -0,0 +1,714 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc-internal.h" + +#if !defined(_WIN32) +#error "this file should only be included on Windows" +#endif + +#include <windows.h> +#include <psapi.h> + +#include <stdlib.h> // getenv +#include <string.h> // strstr + + +/* +To override the C runtime `malloc` on Windows we need to patch the allocation +functions at runtime initialization. Unfortunately we can never patch before the +runtime initializes itself, because as soon as we call `GetProcAddress` on the +runtime module (a DLL or EXE in Windows speak), it will first load and initialize +(by the OS calling `DllMain` on it). + +This means that some things might already be allocated by the C runtime itself +(and possibly other DLL's) before we get to resolve runtime addresses. This is +no problem if everyone unwinds in order: when we unload, we unpatch and restore +the original crt `free` routines and crt malloc'd memory is freed correctly. + +But things go wrong if such early CRT alloc'd memory is freed or re-allocated +_after_ we patch, but _before_ we unload (and unpatch), or if any memory allocated +by us is freed after we unpatched. + +There are two tricky situations to deal with: + +1. The Thread Local Storage (TLS): when the main thread stops it will call registered + callbacks on TLS entries (allocated by `FlsAlloc`). This is done by the OS + before any DLL's are unloaded. Unfortunately, the C runtime registers such + TLS entries with CRT allocated memory which is freed in the callback. + +2. Inside the CRT: + a. Some variables might get initialized by patched allocated + blocks but freed during CRT unloading after we unpatched + (like temporary file buffers). + b. Some blocks are allocated at CRT startup and freed by the CRT (like the + environment storage). + c. And some blocks are allocated by the CRT and then reallocated + while patched, and finally freed after unpatching! This + happens with the `atexit` functions for example to grow the array + of registered functions. + +In principle situation 2 is hopeless: since we cannot patch before CRT initialization, +we can never be sure how to free or reallocate a pointer during CRT unloading.
+However, in practice there is a good solution: when terminating, we just patch +the reallocation and free routines to no-ops -- we are winding down anyway! This leaves +just the reallocation problem of CRT alloc'd memory once we are patched. Here, a study of the +CRT reveals that there seem to be just three such situations: + +1. When registering `atexit` routines (to grow the exit function table), +2. When calling `_setmaxstdio` (to grow the file handle table), +3. and `_popen`/`_wpopen` (to grow handle pairs). These turn out not to be + a problem as they are NULL initialized. + +We fix these by providing wrappers: + +1. We first register a _global_ `atexit` routine ourselves (`mi_patches_atexit`) before patching, + and then patch the `_crt_atexit` function to implement our own global exit list (and the + same for `_crt_at_quick_exit`). All module local lists are no problem since they are always fully + (un)patched from initialization to end. We can register in the global list by dynamically + getting the global `_crt_atexit` entry from `ucrtbase.dll`. + +2. `_setmaxstdio` is _detoured_: we patch it by a stub that unpatches first, + calls the original routine and repatches again. + +That leaves us to reliably shut down and enter "termination mode": + +1. Using our trick to get the global exit list entry point, we register an exit function `mi_patches_atexit` + that first executes all our home-brew list of exit functions, and then enters a _termination_ + phase that patches realloc/free variants with no-ops. Patching later again with special no-ops for + `free` also improves efficiency during the program run since no flags need to be checked. + +2. That is not quite good enough yet: after the exit routines registered after ours on the + global exit list (by the CRT) have executed, + the OS starts to unwind the TLS callbacks, and we would like callbacks registered after our + DLL was loaded to still run in patched mode. So, we also allocate a TLS entry when our DLL is loaded and when its + callback is called, we re-enable the original patches again. Since TLS is destroyed in FIFO order + this runs any callbacks in later DLL's in patched mode. + +3. Finally the DLL's get unloaded by the OS in order (still patched) until our DLL gets unloaded + and then we start a termination phase again, and patch realloc/free with no-ops for good this time. + +*/
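A minimal sketch of the patching mechanism this comment describes, for x86-64 only and assuming the replacement lies within 2GB of the victim; `patch` and `saved` are illustrative names of our own, while the real logic lives in `mi_jump_write` and `mi_patch_apply` below:

#include <windows.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define JUMP_SIZE 5                        // E9 + rel32

static uint8_t saved[JUMP_SIZE];           // original prologue bytes, for unpatching

static bool patch(void* victim, void* replacement) {
  DWORD prot;
  // make the function prologue writable
  if (!VirtualProtect(victim, JUMP_SIZE, PAGE_EXECUTE_READWRITE, &prot)) return false;
  memcpy(saved, victim, JUMP_SIZE);        // save so we can restore later
  uint8_t* p = (uint8_t*)victim;
  p[0] = 0xE9;                             // jmp rel32 to the replacement
  *((int32_t*)&p[1]) = (int32_t)((uint8_t*)replacement - p - JUMP_SIZE);
  VirtualProtect(victim, JUMP_SIZE, prot, &prot);
  return true;
}

Unpatching is the reverse: copy `saved` back under the same `VirtualProtect` dance, which is exactly what `mi_jump_restore` does in the real code.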
+ +static int __cdecl mi_setmaxstdio(int newmax); + +// ------------------------------------------------------ +// Microsoft allocation extensions +// ------------------------------------------------------ + + +typedef size_t mi_nothrow_t; + +static void mi_free_nothrow(void* p, mi_nothrow_t tag) { + UNUSED(tag); + mi_free(p); +} + +// Versions of `free`, `realloc`, `recalloc`, `expand` and `msize` +// that are used during termination and are no-ops. +static void mi_free_term(void* p) { + UNUSED(p); +} + +static void mi_free_size_term(void* p, size_t size) { + UNUSED(size); + UNUSED(p); +} + +static void mi_free_nothrow_term(void* p, mi_nothrow_t tag) { + UNUSED(tag); + UNUSED(p); +} + +static void* mi_realloc_term(void* p, size_t newsize) { + UNUSED(p); UNUSED(newsize); + return NULL; +} + +static void* mi__recalloc_term(void* p, size_t newcount, size_t newsize) { + UNUSED(p); UNUSED(newcount); UNUSED(newsize); + return NULL; +} + +static void* mi__expand_term(void* p, size_t newsize) { + UNUSED(p); UNUSED(newsize); + return NULL; +} + +static size_t mi__msize_term(void* p) { + UNUSED(p); + return 0; +} + + +static void* mi__malloc_dbg(size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return _malloc_base(size); +} + +static void* mi__calloc_dbg(size_t count, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return _calloc_base(count, size); +} + +static void* mi__realloc_dbg(void* p, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return _realloc_base(p, size); +} + +static void mi__free_dbg(void* p, int block_type) { + UNUSED(block_type); + _free_base(p); +} + + +// the `recalloc`,`expand`, and `msize` don't have base versions and thus need a separate term version + +static void* mi__recalloc_dbg(void* p, size_t count, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return mi_recalloc(p, count, size); +} + +static void* mi__expand_dbg(void* p, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return mi__expand(p, size); +} + +static size_t mi__msize_dbg(void* p, int block_type) { + UNUSED(block_type); + return mi_usable_size(p); +} + +static void* mi__recalloc_dbg_term(void* p, size_t count, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return mi__recalloc_term(p, count, size); +} + +static void* mi__expand_dbg_term(void* p, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return mi__expand_term(p, size); +} + +static size_t mi__msize_dbg_term(void* p, int block_type) { + UNUSED(block_type); + return mi__msize_term(p); +} + + +// ------------------------------------------------------ +// implement our own global atexit handler +// ------------------------------------------------------ +typedef void (cbfun_t)(void); +typedef int (atexit_fun_t)(cbfun_t* fn); +typedef uintptr_t encoded_t; + +typedef struct exit_list_s { + encoded_t functions; // encoded pointer to array of encoded function pointers + size_t count; + size_t capacity; +} exit_list_t; + +#define MI_EXIT_INC (64) + +static exit_list_t atexit_list = { 0, 0, 0 }; +static exit_list_t at_quick_exit_list = { 0, 0, 0 }; +static CRITICAL_SECTION atexit_lock; + +// encode/decode function pointers with a random canary for security +static encoded_t canary; + +static inline void *decode(encoded_t x) { + return (void*)(x^canary); +} + +static inline encoded_t encode(void* p) { + return ((uintptr_t)p ^ canary); +} + + +static void init_canary() +{ + canary = _mi_random_init(0); + atexit_list.functions = at_quick_exit_list.functions = encode(NULL); +} + + +// initialize the list +static void mi_initialize_atexit(void) {
InitializeCriticalSection(&atexit_lock); + init_canary(); +} + +// register an exit function +static int mi_register_atexit(exit_list_t* list, cbfun_t* fn) { + if (fn == NULL) return EINVAL; + EnterCriticalSection(&atexit_lock); + encoded_t* functions = (encoded_t*)decode(list->functions); + if (list->count >= list->capacity) { // at first `functions == decode(0) == NULL` + encoded_t* newf = (encoded_t*)mi_recalloc(functions, list->capacity + MI_EXIT_INC, sizeof(cbfun_t*)); + if (newf != NULL) { + list->capacity += MI_EXIT_INC; + list->functions = encode(newf); + functions = newf; + } + } + int result; + if (list->count < list->capacity && functions != NULL) { + functions[list->count] = encode(fn); + list->count++; + result = 0; // success + } + else { + result = ENOMEM; + } + LeaveCriticalSection(&atexit_lock); + return result; +} + +// Register a global `atexit` function +static int mi_atexit(cbfun_t* fn) { + return mi_register_atexit(&atexit_list,fn); +} + +static int mi_at_quick_exit(cbfun_t* fn) { + return mi_register_atexit(&at_quick_exit_list,fn); +} + +static int mi_register_onexit(void* table, cbfun_t* fn) { + // TODO: how can we distinguish a quick_exit from atexit? + return mi_atexit(fn); +} + +// Execute exit functions in a list +static void mi_execute_exit_list(exit_list_t* list) { + // copy and zero the list structure + EnterCriticalSection(&atexit_lock); + exit_list_t clist = *list; + memset(list,0,sizeof(*list)); + LeaveCriticalSection(&atexit_lock); + + // now execute the functions outside of the lock + encoded_t* functions = (encoded_t*)decode(clist.functions); + if (functions != NULL) { + for (size_t i = clist.count; i > 0; i--) { // careful with unsigned count down.. + cbfun_t* fn = (cbfun_t*)decode(functions[i-1]); + if (fn==NULL) break; // corrupted! 
+ fn(); + } + mi_free(functions); + } +} + + + +// ------------------------------------------------------ +// Jump assembly instructions for patches +// ------------------------------------------------------ + +#if defined(_M_IX86) || defined(_M_X64) + +#define MI_JUMP_SIZE 14 // at most 2+4+8 for a long jump or 1+5 for a short one + +typedef struct mi_jump_s { + uint8_t opcodes[MI_JUMP_SIZE]; +} mi_jump_t; + +void mi_jump_restore(void* current, const mi_jump_t* saved) { + memcpy(current, &saved->opcodes, MI_JUMP_SIZE); +} + +void mi_jump_write(void* current, void* target, mi_jump_t* save) { + if (save != NULL) { + memcpy(&save->opcodes, current, MI_JUMP_SIZE); + } + uint8_t* opcodes = ((mi_jump_t*)current)->opcodes; + ptrdiff_t diff = (uint8_t*)target - (uint8_t*)current; + uint32_t ofs32 = (uint32_t)diff; + #ifdef _M_X64 + uint64_t ofs64 = (uint64_t)diff; + if (ofs64 != (uint64_t)ofs32) { + // use long jump + opcodes[0] = 0xFF; + opcodes[1] = 0x25; + *((uint32_t*)&opcodes[2]) = 0; + *((uint64_t*)&opcodes[6]) = (uint64_t)target; + } + else + #endif + { + // use short jump + opcodes[0] = 0xE9; + *((uint32_t*)&opcodes[1]) = ofs32 - 5 /* size of the short jump instruction */; + } +} + +#elif defined(_M_ARM64) + +#define MI_JUMP_SIZE 16 + +typedef struct mi_jump_s { + uint8_t opcodes[MI_JUMP_SIZE]; +} mi_jump_t; + +void mi_jump_restore(void* current, const mi_jump_t* saved) { + memcpy(current, &saved->opcodes, MI_JUMP_SIZE); +} + +void mi_jump_write(void* current, void* target, mi_jump_t* save) { + if (save != NULL) { + memcpy(&save->opcodes, current, MI_JUMP_SIZE); + } + uint8_t* opcodes = ((mi_jump_t*)current)->opcodes; + uint64_t diff = (uint8_t*)target - (uint8_t*)current; + + // 0x50 0x00 0x00 0x58 ldr x16, .+8 # load PC relative +8 + // 0x00 0x02 0x3F 0xD6 blr x16 # and jump + //
+ //
+ static const uint8_t jump_opcodes[8] = { 0x50, 0x00, 0x00, 0x58, 0x00, 0x02, 0x3F, 0xD6 }; + memcpy(&opcodes[0], jump_opcodes, sizeof(jump_opcodes)); + *((uint64_t*)&opcodes[8]) = (uint64_t)target; +} + +#else +#error "define jump instructions for this platform" +#endif + + +// ------------------------------------------------------ +// Patches +// ------------------------------------------------------ +typedef enum patch_apply_e { + PATCH_NONE, + PATCH_TARGET, + PATCH_TARGET_TERM +} patch_apply_t; + +#define MAX_ENTRIES 4 // maximum number of patched entry points (like `malloc` in ucrtbase and msvcrt) + +typedef struct mi_patch_s { + const char* name; // name of the function to patch + void* target; // the address of the new target (never NULL) + void* target_term; // the address of the target during termination (or NULL) + patch_apply_t applied; // what target has been applied? + void* originals[MAX_ENTRIES]; // the resolved addresses of the function (or NULLs) + mi_jump_t saves[MAX_ENTRIES]; // the saved instructions in case it was applied +} mi_patch_t; + +#define MI_PATCH_NAME3(name,target,term) { name, &target, &term, PATCH_NONE, {NULL,NULL,NULL,NULL} } +#define MI_PATCH_NAME2(name,target) { name, &target, NULL, PATCH_NONE, {NULL,NULL,NULL,NULL} } +#define MI_PATCH3(name,target,term) MI_PATCH_NAME3(#name, target, term) +#define MI_PATCH2(name,target) MI_PATCH_NAME2(#name, target) +#define MI_PATCH1(name) MI_PATCH2(name,mi_##name) + +static mi_patch_t patches[] = { + // we implement our own global exit handler (as the CRT versions do a realloc internally) + //MI_PATCH2(_crt_atexit, mi_atexit), + //MI_PATCH2(_crt_at_quick_exit, mi_at_quick_exit), + MI_PATCH2(_setmaxstdio, mi_setmaxstdio), + MI_PATCH2(_register_onexit_function, mi_register_onexit), + + // override higher level atexit functions so we can implement at_quick_exit correctly + MI_PATCH2(atexit, mi_atexit), + MI_PATCH2(at_quick_exit, mi_at_quick_exit), + + // regular entries + MI_PATCH2(malloc, mi_malloc), + MI_PATCH2(calloc, mi_calloc), + MI_PATCH3(realloc, mi_realloc,mi_realloc_term), + MI_PATCH3(free, mi_free,mi_free_term), + + // extended api + MI_PATCH2(_strdup, mi_strdup), + MI_PATCH2(_strndup, mi_strndup), + MI_PATCH3(_expand, mi__expand,mi__expand_term), + MI_PATCH3(_recalloc, mi_recalloc,mi__recalloc_term), + MI_PATCH3(_msize, mi_usable_size,mi__msize_term), + + // base versions + MI_PATCH2(_malloc_base, mi_malloc), + MI_PATCH2(_calloc_base, mi_calloc), + MI_PATCH3(_realloc_base, mi_realloc,mi_realloc_term), + MI_PATCH3(_free_base, mi_free,mi_free_term), + + // these base versions are in the crt but without import records + MI_PATCH_NAME3("_recalloc_base", mi_recalloc,mi__recalloc_term), + MI_PATCH_NAME3("_msize_base", mi_usable_size,mi__msize_term), + + // debug + MI_PATCH2(_malloc_dbg, mi__malloc_dbg), + MI_PATCH2(_realloc_dbg, mi__realloc_dbg), + MI_PATCH2(_calloc_dbg, mi__calloc_dbg), + MI_PATCH2(_free_dbg, mi__free_dbg), + + MI_PATCH3(_expand_dbg, mi__expand_dbg, mi__expand_dbg_term), + MI_PATCH3(_recalloc_dbg, mi__recalloc_dbg, mi__recalloc_dbg_term), + MI_PATCH3(_msize_dbg, mi__msize_dbg, mi__msize_dbg_term), + +#if 0 + // override new/delete variants for efficiency (?)
+#ifdef _WIN64 + // 64 bit new/delete + MI_PATCH_NAME2("??2@YAPEAX_K@Z", mi_new), + MI_PATCH_NAME2("??_U@YAPEAX_K@Z", mi_new), + MI_PATCH_NAME3("??3@YAXPEAX@Z", mi_free, mi_free_term), + MI_PATCH_NAME3("??_V@YAXPEAX@Z", mi_free, mi_free_term), + MI_PATCH_NAME3("??3@YAXPEAX_K@Z", mi_free_size, mi_free_size_term), // delete sized + MI_PATCH_NAME3("??_V@YAXPEAX_K@Z", mi_free_size, mi_free_size_term), // delete sized + MI_PATCH_NAME2("??2@YAPEAX_KAEBUnothrow_t@std@@@Z", mi_new), + MI_PATCH_NAME2("??_U@YAPEAX_KAEBUnothrow_t@std@@@Z", mi_new), + MI_PATCH_NAME3("??3@YAXPEAXAEBUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term), + MI_PATCH_NAME3("??_V@YAXPEAXAEBUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term), + + +#else + // 32 bit new/delete + MI_PATCH_NAME2("??2@YAPAXI@Z", mi_new), + MI_PATCH_NAME2("??_U@YAPAXI@Z", mi_new), + MI_PATCH_NAME3("??3@YAXPAX@Z", mi_free, mi_free_term), + MI_PATCH_NAME3("??_V@YAXPAX@Z", mi_free, mi_free_term), + MI_PATCH_NAME3("??3@YAXPAXI@Z", mi_free_size, mi_free_size_term), // delete sized + MI_PATCH_NAME3("??_V@YAXPAXI@Z", mi_free_size, mi_free_size_term), // delete sized + + MI_PATCH_NAME2("??2@YAPAXIABUnothrow_t@std@@@Z", mi_new), + MI_PATCH_NAME2("??_U@YAPAXIABUnothrow_t@std@@@Z", mi_new), + MI_PATCH_NAME3("??3@YAXPAXABUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term), + MI_PATCH_NAME3("??_V@YAXPAXABUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term), + +#endif +#endif + { NULL, NULL, NULL, PATCH_NONE, {NULL,NULL,NULL,NULL} } +}; + + +// Apply a patch +static bool mi_patch_apply(mi_patch_t* patch, patch_apply_t apply) +{ + if (patch->originals[0] == NULL) return true; // unresolved + if (apply == PATCH_TARGET_TERM && patch->target_term == NULL) apply = PATCH_TARGET; // avoid re-applying non-term variants + if (patch->applied == apply) return false; + + for (int i = 0; i < MAX_ENTRIES; i++) { + void* original = patch->originals[i]; + if (original == NULL) break; // no more + + DWORD protect = PAGE_READWRITE; + if (!VirtualProtect(original, MI_JUMP_SIZE, PAGE_EXECUTE_READWRITE, &protect)) return false; + if (apply == PATCH_NONE) { + mi_jump_restore(original, &patch->saves[i]); + } + else { + void* target = (apply == PATCH_TARGET ? patch->target : patch->target_term); + mi_assert_internal(target != NULL); + if (target != NULL) mi_jump_write(original, target, &patch->saves[i]); + } + VirtualProtect(original, MI_JUMP_SIZE, protect, &protect); + } + patch->applied = apply; + return true; +} + +// Apply all patches +static bool _mi_patches_apply(patch_apply_t apply, patch_apply_t* previous) { + static patch_apply_t current = PATCH_NONE; + if (previous != NULL) *previous = current; + if (current == apply) return true; + current = apply; + bool ok = true; + for (size_t i = 0; patches[i].name != NULL; i++) { + if (!mi_patch_apply(&patches[i], apply)) ok = false; + } + return ok; +} + +// Export the following three functions just in case +// a user needs that level of control. 
+ +// Disable all patches +mi_decl_export void mi_patches_disable(void) { + _mi_patches_apply(PATCH_NONE, NULL); +} + +// Enable all patches normally +mi_decl_export bool mi_patches_enable(void) { + return _mi_patches_apply( PATCH_TARGET, NULL ); +} + +// Enable all patches in termination phase where free is a no-op +mi_decl_export bool mi_patches_enable_term(void) { + return _mi_patches_apply(PATCH_TARGET_TERM, NULL); +} + +// ------------------------------------------------------ +// Stub for _setmaxstdio +// ------------------------------------------------------ + +static int __cdecl mi_setmaxstdio(int newmax) { + patch_apply_t previous; + _mi_patches_apply(PATCH_NONE, &previous); // disable patches + int result = _setmaxstdio(newmax); // call original function (that calls original CRT recalloc) + _mi_patches_apply(previous,NULL); // and re-enable patches + return result; +} + + +// ------------------------------------------------------ +// Resolve addresses dynamically +// ------------------------------------------------------ + +// Try to resolve patches for a given module (DLL) +static void mi_module_resolve(const char* fname, HMODULE mod, int priority) { + // see if any patches apply + for (size_t i = 0; patches[i].name != NULL; i++) { + mi_patch_t* patch = &patches[i]; + if (patch->applied == PATCH_NONE) { + // find an available entry + int i = 0; + while (i < MAX_ENTRIES && patch->originals[i] != NULL) i++; + if (i < MAX_ENTRIES) { + void* addr = GetProcAddress(mod, patch->name); + if (addr != NULL) { + // found it! set the address + patch->originals[i] = addr; + _mi_trace_message(" found %s at %s!%p (entry %i)\n", patch->name, fname, addr, i); + } + } + } + } +} + +#define MIMALLOC_NAME "mimalloc-override.dll" +#define UCRTBASE_NAME "ucrtbase.dll" +#define UCRTBASED_NAME "ucrtbased.dll" + +// Resolve addresses of all patches by inspecting the loaded modules +static atexit_fun_t* crt_atexit = NULL; +static atexit_fun_t* crt_at_quick_exit = NULL; + + +static bool mi_patches_resolve(void) { + // get all loaded modules + HANDLE process = GetCurrentProcess(); // always -1, no need to release + DWORD needed = 0; + HMODULE modules[400]; // try to stay under 4k to not trigger the guard page + EnumProcessModules(process, modules, sizeof(modules), &needed); + if (needed == 0) return false; + int count = needed / sizeof(HMODULE); + int ucrtbase_index = 0; + int mimalloc_index = 0; + // iterate through the loaded modules + for (int i = 0; i < count; i++) { + HMODULE mod = modules[i]; + char filename[MAX_PATH] = { 0 }; + DWORD slen = GetModuleFileName(mod, filename, MAX_PATH); + if (slen > 0 && slen < MAX_PATH) { + // filter out potential crt modules only + filename[slen] = 0; + const char* lastsep = strrchr(filename, '\\'); + const char* basename = (lastsep==NULL ? 
filename : lastsep+1); + _mi_trace_message(" %i: dynamic module %s\n", i, filename); + + // remember indices so we can check load order (in debug mode) + if (_stricmp(basename, MIMALLOC_NAME) == 0) mimalloc_index = i; + if (_stricmp(basename, UCRTBASE_NAME) == 0) ucrtbase_index = i; + if (_stricmp(basename, UCRTBASED_NAME) == 0) ucrtbase_index = i; + + // see if we potentially patch in this module + int priority = 0; + if (i == 0) priority = 2; // main module to allow static crt linking + else if (_strnicmp(basename, "ucrt", 4) == 0) priority = 3; // new ucrtbase.dll in windows 10 + // NOTE: don't override msvcr -- leads to crashes in setlocale (needs more testing) + // else if (_strnicmp(basename, "msvcr", 5) == 0) priority = 1; // older runtimes + + if (priority > 0) { + // probably found a crt module, try to patch it + mi_module_resolve(basename,mod,priority); + + // try to find the atexit functions for the main process (in `ucrtbase.dll`) + if (crt_atexit==NULL) crt_atexit = (atexit_fun_t*)GetProcAddress(mod, "_crt_atexit"); + if (crt_at_quick_exit == NULL) crt_at_quick_exit = (atexit_fun_t*)GetProcAddress(mod, "_crt_at_quick_exit"); + } + } + } + int diff = mimalloc_index - ucrtbase_index; + if (diff > 1) { + _mi_warning_message("warning: the \"mimalloc-override\" DLL seems not to load before or right after the C runtime (\"ucrtbase\").\n" + " Try to fix this by changing the linking order.\n"); + } + return true; +} + + +// ------------------------------------------------------ +// Dll Entry +// ------------------------------------------------------ + +extern BOOL WINAPI _DllMainCRTStartup(HINSTANCE inst, DWORD reason, LPVOID reserved); + +static DWORD mi_fls_unwind_entry; +static void NTAPI mi_fls_unwind(PVOID value) { + if (value != NULL) mi_patches_enable(); // and re-enable normal patches again for DLL's loaded after us + return; +} + +static void mi_patches_atexit(void) { + mi_execute_exit_list(&atexit_list); + mi_patches_enable_term(); // enter termination phase and patch realloc/free with a no-op +} + +static void mi_patches_at_quick_exit(void) { + mi_execute_exit_list(&at_quick_exit_list); + mi_patches_enable_term(); // enter termination phase and patch realloc/free with a no-op +} + +__declspec(dllexport) BOOL WINAPI DllEntry(HINSTANCE inst, DWORD reason, LPVOID reserved) { + if (reason == DLL_PROCESS_ATTACH) { + __security_init_cookie(); + } + else if (reason == DLL_PROCESS_DETACH) { + // enter termination phase for good now + mi_patches_enable_term(); + } + // C runtime main + BOOL ok = _DllMainCRTStartup(inst, reason, reserved); + if (reason == DLL_PROCESS_ATTACH && ok) { + // initialize at exit lists + mi_initialize_atexit(); + + // Now resolve patches + ok = mi_patches_resolve(); + if (ok) { + // check if patching is not disabled + #pragma warning(suppress:4996) + const char* s = getenv("MIMALLOC_DISABLE_OVERRIDE"); + bool enabled = (s == NULL || !(strstr("1;TRUE;YES;ON", s) != NULL)); + if (!enabled) { + _mi_verbose_message("override is disabled\n"); + } + else { + // and register our unwind entry (this must be after resolving due to possible delayed DLL initialization from GetProcAddress) + mi_fls_unwind_entry = FlsAlloc(&mi_fls_unwind); + if (mi_fls_unwind_entry != FLS_OUT_OF_INDEXES) { + FlsSetValue(mi_fls_unwind_entry, (void*)1); + } + + // register our patch disabler in the global exit list + if (crt_atexit != NULL) (*crt_atexit)(&mi_patches_atexit); + if (crt_at_quick_exit != NULL) (*crt_at_quick_exit)(&mi_patches_at_quick_exit); + + // and patch ! 
this also redirects the `atexit` handling for the global exit list + mi_patches_enable(); + _mi_verbose_message("override is enabled\n"); + + // hide internal allocation + mi_stats_reset(); + } + } + } + return ok; +} diff --git a/src/alloc-override.c b/src/alloc-override.c index 345d396c..e5eeaab2 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -9,7 +9,7 @@ terms of the MIT license. A copy of the license can be found in the file #error "this file should be included from 'alloc.c' (so aliases can work)" #endif -#if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL) && defined(_WIN64)) +#if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL)) #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif diff --git a/src/alloc-posix.c b/src/alloc-posix.c index 1f55b3a8..672b73b3 100644 --- a/src/alloc-posix.c +++ b/src/alloc-posix.c @@ -38,7 +38,9 @@ size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept { } void mi_cfree(void* p) mi_attr_noexcept { - mi_free(p); + if (mi_is_in_heap_region(p)) { + mi_free(p); + } } int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept { diff --git a/src/alloc.c b/src/alloc.c index 6a91c0ad..bfb37d19 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -57,6 +57,7 @@ extern inline void* mi_malloc_small(size_t size) mi_attr_noexcept { return mi_heap_malloc_small(mi_get_default_heap(), size); } + // zero initialized small block void* mi_zalloc_small(size_t size) mi_attr_noexcept { void* p = mi_malloc_small(size); @@ -71,7 +72,7 @@ extern inline void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcep void* p; if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { p = mi_heap_malloc_small(heap, size); - } + } else { p = _mi_malloc_generic(heap, size); } @@ -223,8 +224,7 @@ void mi_free(void* p) mi_attr_noexcept return; } #endif - - bool local = (_mi_thread_id() == segment->thread_id); // preload, note: putting the thread_id in the page->flags does not improve performance + mi_page_t* page = _mi_segment_page_of(segment, p); #if (MI_STAT>1) @@ -236,24 +236,18 @@ void mi_free(void* p) mi_attr_noexcept // huge page stat is accounted for in `_mi_page_retire` #endif - // adjust if it might be an un-aligned block - if (mi_likely(page->flags.value==0)) { // not full or aligned + uintptr_t tid = _mi_thread_id(); + if (mi_likely(tid == page->flags.value)) { + // local, and not full or aligned mi_block_t* block = (mi_block_t*)p; - if (mi_likely(local)) { // note: merging both tests (local | value) does not matter for performance - // owning thread can free a block directly - mi_block_set_next(page, block, page->local_free); // note: moving this write earlier does not matter for performance - page->local_free = block; - page->used--; - if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); } - } - else { - // use atomic operations for a multi-threaded free - _mi_free_block_mt(page, block); - } + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + page->used--; + if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); } } else { - // aligned blocks, or a full page; use the more generic path - mi_free_generic(segment, page, local, p); + // non-local, aligned blocks, or a full page; use the more generic path - mi_free_generic(segment, page, tid == mi_page_thread_id(page), p); } }
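The rewritten `mi_free` above rules out three unlikely cases (non-local free, a full page, aligned blocks) with a single comparison, because the owning thread id fills the upper bits of `page->flags.value` while `in_full` and `has_aligned` occupy the two low bits. A minimal standalone model of that check -- `page_t`, `page_init_flags`, and `free_is_fast` are our stand-in names, and we assume thread ids have their low two bits zero, which holds since mimalloc derives them from aligned addresses:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define FLAG_IN_FULL      ((uintptr_t)1)   // low bit 0
#define FLAG_HAS_ALIGNED  ((uintptr_t)2)   // low bit 1

typedef struct page_s {
  uintptr_t flags;  // thread id in the upper bits, two flag bits at the bottom
} page_t;

static void page_init_flags(page_t* page, uintptr_t tid) {
  page->flags = tid;                       // both flags start out clear
}

static bool free_is_fast(const page_t* page, uintptr_t tid) {
  return (page->flags == tid);             // local AND not full AND no aligned blocks
}

int main(void) {
  page_t page;
  page_init_flags(&page, 0x1000);
  assert(free_is_fast(&page, 0x1000));     // owner thread, no flags: fast path
  assert(!free_is_fast(&page, 0x2000));    // non-local free: generic path
  page.flags |= FLAG_IN_FULL;
  assert(!free_is_fast(&page, 0x1000));    // full page: generic path
  (void)FLAG_HAS_ALIGNED;                  // the aligned flag works the same way
  return 0;
}

Setting either flag, or freeing from another thread, makes the equality fail and sends the free down `mi_free_generic`, which is exactly the behavior of the single `mi_likely` test above.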
diff --git a/src/init.c b/src/init.c index 44e3c9cb..f807d74a 100644 --- a/src/init.c +++ b/src/init.c @@ -12,15 +12,16 @@ terms of the MIT license. A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, false, false, false, {0}, 0, 0, - NULL, 0, // free, used + 0, false, false, false, 0, 0, + NULL, // free #if MI_SECURE 0, #endif + 0, {0}, // used, flags NULL, 0, 0, 0, NULL, NULL, NULL - #if (MI_INTPTR_SIZE==4) - , { NULL } + #if (MI_INTPTR_SIZE==8 && MI_SECURE==0) + , { NULL } #endif }; @@ -33,22 +34,23 @@ const mi_page_t _mi_page_empty = { #define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) } #define MI_PAGE_QUEUES_EMPTY \ { QNULL(1), \ - QNULL(1), QNULL(2), QNULL(3), QNULL(4), QNULL(5), QNULL(6), QNULL(7), QNULL(8), \ - QNULL(10), QNULL(12), QNULL(14), QNULL(16), QNULL(20), QNULL(24), QNULL(28), QNULL(32), \ - QNULL(40), QNULL(48), QNULL(56), QNULL(64), QNULL(80), QNULL(96), QNULL(112), QNULL(128), \ - QNULL(160), QNULL(192), QNULL(224), QNULL(256), QNULL(320), QNULL(384), QNULL(448), QNULL(512), \ - QNULL(640), QNULL(768), QNULL(896), QNULL(1024), QNULL(1280), QNULL(1536), QNULL(1792), QNULL(2048), \ - QNULL(2560), QNULL(3072), QNULL(3584), QNULL(4096), QNULL(5120), QNULL(6144), QNULL(7168), QNULL(8192), \ - QNULL(10240), QNULL(12288), QNULL(14336), QNULL(16384), QNULL(20480), QNULL(24576), QNULL(28672), QNULL(32768), \ - QNULL(40960), QNULL(49152), QNULL(57344), QNULL(65536), QNULL(81920), QNULL(98304), QNULL(114688), \ - QNULL(MI_LARGE_WSIZE_MAX + 1 /*131072, Huge queue */), \ + QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \ + QNULL( 10), QNULL( 12), QNULL( 14), QNULL( 16), QNULL( 20), QNULL( 24), QNULL( 28), QNULL( 32), /* 16 */ \ + QNULL( 40), QNULL( 48), QNULL( 56), QNULL( 64), QNULL( 80), QNULL( 96), QNULL( 112), QNULL( 128), /* 24 */ \ + QNULL( 160), QNULL( 192), QNULL( 224), QNULL( 256), QNULL( 320), QNULL( 384), QNULL( 448), QNULL( 512), /* 32 */ \ + QNULL( 640), QNULL( 768), QNULL( 896), QNULL( 1024), QNULL( 1280), QNULL( 1536), QNULL( 1792), QNULL( 2048), /* 40 */ \ + QNULL( 2560), QNULL( 3072), QNULL( 3584), QNULL( 4096), QNULL( 5120), QNULL( 6144), QNULL( 7168), QNULL( 8192), /* 48 */ \ + QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ + QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ + QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), /* 69 */ \ + QNULL(MI_LARGE_WSIZE_MAX + 1 /* 393216, Huge queue */), \ QNULL(MI_LARGE_WSIZE_MAX + 2) /* Full queue */ } #define MI_STAT_COUNT_NULL() {0,0,0,0} // Empty statistics #if MI_STAT>1 -#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT64(MI_STAT_COUNT_NULL) } +#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) } #else #define MI_STAT_COUNT_END_NULL() #endif @@ -61,7 +63,8 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), \ + { 0, 0 }, \ { 0, 0 } \ MI_STAT_COUNT_END_NULL() @@ -95,8 +98,8 @@ static mi_tld_t tld_main = { 0, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments - { 0, NULL, NULL, 0, tld_main_stats }, // os - { MI_STATS_NULL } // stats +
{ 0, NULL, NULL, 0, tld_main_stats }, // os + { MI_STATS_NULL } // stats }; mi_heap_t _mi_heap_main = { @@ -385,7 +388,7 @@ bool _mi_preloading() { } // Communicate with the redirection module on Windows -#if defined(_WIN32) && defined(MI_SHARED_LIB) +#if 0 #ifdef __cplusplus extern "C" { #endif diff --git a/src/memory.c b/src/memory.c index e7d1887e..7f8cfb14 100644 --- a/src/memory.c +++ b/src/memory.c @@ -106,6 +106,7 @@ static size_t mi_good_commit_size(size_t size) { // Return if a pointer points into a region reserved by us. bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + if (p==NULL) return false; size_t count = mi_atomic_read(&regions_count); for (size_t i = 0; i < count; i++) { uint8_t* start = (uint8_t*)mi_atomic_read_ptr(&regions[i].start); diff --git a/src/page-queue.c b/src/page-queue.c index fd388113..a386f8a1 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -97,7 +97,7 @@ uint8_t _mi_bsr(uintptr_t x) { // Returns MI_BIN_HUGE if the size is too large. // We use `wsize` for the size in "machine word sizes", // i.e. byte size == `wsize*sizeof(void*)`. -inline uint8_t _mi_bin(size_t size) { +extern inline uint8_t _mi_bin(size_t size) { size_t wsize = _mi_wsize_from_size(size); uint8_t bin; if (wsize <= 1) { @@ -120,13 +120,13 @@ inline uint8_t _mi_bin(size_t size) { bin = MI_BIN_HUGE; } else { - #if defined(MI_ALIGN4W) + #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif wsize--; // find the highest bit uint8_t b = mi_bsr32((uint32_t)wsize); - // and use the top 3 bits to determine the bin (~16% worst internal fragmentation). + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we do not round the first 8 sizes // which each get an exact bin bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
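Since the size-class table in `init.c` above is generated by this computation, a standalone sketch may help; `bin_of` and `bsr32` are our stand-ins for `_mi_bin` and `mi_bsr32`, assuming 64-bit words and ignoring the `MI_ALIGN4W` case:

#include <stdint.h>
#include <stdio.h>

static uint8_t bsr32(uint32_t x) {        // index of the highest set bit
  uint8_t b = 0;
  while (x > 1) { x >>= 1; b++; }
  return b;
}

static uint8_t bin_of(size_t size) {
  size_t wsize = (size + 7) / 8;          // bytes -> machine words (64-bit)
  if (wsize <= 1) return 1;
  if (wsize <= 8) return (uint8_t)wsize;  // the first 8 sizes get an exact bin
  wsize--;
  uint8_t b = bsr32((uint32_t)wsize);
  // leading bit plus the next two select one of 4 sub-bins per power of two
  return ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
}

int main(void) {
  size_t sizes[] = { 8, 64, 72, 80, 96, 1024, 1040 };
  for (size_t i = 0; i < sizeof(sizes)/sizeof(*sizes); i++) {
    printf("size %5zu -> bin %u\n", sizes[i], bin_of(sizes[i]));
  }
  return 0;
}

For example, 72 and 80 bytes (9 and 10 words) both land in bin 9, the 10-word class from the queue table, while 1040 bytes rounds up to the 160-word class: each power-of-two range is split into 4 linear sub-bins, which is where the 12.5% spacing comes from.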
diff --git a/src/page.c b/src/page.c index 69d32bfe..e6be8df6 100644 --- a/src/page.c +++ b/src/page.c @@ -71,10 +71,11 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->block_size > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - + mi_segment_t* segment = _mi_page_segment(page); uint8_t* start = _mi_page_start(segment,page,NULL); mi_assert_internal(start == _mi_segment_page_start(segment,page,page->block_size,NULL)); + mi_assert_internal(segment->thread_id==0 || segment->thread_id == mi_page_thread_id(page)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -216,7 +217,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_page_t* page = _mi_segment_page_alloc(block_size, &heap->tld->segments, &heap->tld->os); if (page == NULL) return NULL; mi_page_init(heap, page, block_size, &heap->tld->stats); - mi_heap_stat_increase( heap, pages, 1); + _mi_stat_increase( &heap->tld->stats.pages, 1); mi_page_queue_push(heap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); return page; @@ -352,7 +353,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { // account for huge pages here if (page->block_size > MI_LARGE_SIZE_MAX) { - mi_heap_stat_decrease(page->heap, huge, page->block_size); + _mi_stat_decrease(&page->heap->tld->stats.huge, page->block_size); } // remove from the page list @@ -384,8 +385,9 @@ void _mi_page_retire(mi_page_t* page) { // is the only page left with free blocks. It is not clear // how to check this efficiently though... for now we just check // if its neighbours are almost fully used. - if (mi_likely(page->block_size <= MI_SMALL_SIZE_MAX)) { + if (mi_likely(page->block_size <= MI_MEDIUM_SIZE_MAX)) { if (mi_page_mostly_used(page->prev) && mi_page_mostly_used(page->next)) { + _mi_stat_counter_increase(&_mi_stats_main.page_no_retire,1); return; // don't retire after all } } @@ -404,7 +406,60 @@ void _mi_page_retire(mi_page_t* page) { #define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT) #define MI_MIN_SLICES (2) -static void mi_page_free_list_extend( mi_heap_t* heap, mi_page_t* page, size_t extend, mi_stats_t* stats) +static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, size_t extend, mi_stats_t* stats) { + UNUSED(stats); + mi_assert_internal(page->free == NULL); + mi_assert_internal(page->local_free == NULL); + mi_assert_internal(page->capacity + extend <= page->reserved); + void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL); + size_t bsize = page->block_size; + + // initialize a randomized free list + // set up `slice_count` slices to alternate between + size_t shift = MI_MAX_SLICE_SHIFT; + while ((extend >> shift) == 0) { + shift--; + } + size_t slice_count = (size_t)1U << shift; + size_t slice_extend = extend / slice_count; + mi_assert_internal(slice_extend >= 1); + mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice + size_t counts[MI_MAX_SLICES]; // available objects in the slice + for (size_t i = 0; i < slice_count; i++) { + blocks[i] = mi_page_block_at(page, page_area, page->capacity + i*slice_extend); + counts[i] = slice_extend; + } + counts[slice_count-1] += (extend % slice_count); // final slice holds the modulus too (todo: distribute evenly?) + + // and initialize the free list by randomly threading through them + // set up first element + size_t current = _mi_heap_random(heap) % slice_count; + counts[current]--; + page->free = blocks[current]; + // and iterate through the rest + uintptr_t rnd = heap->random; + for (size_t i = 1; i < extend; i++) { + // call random_shuffle only every INTPTR_SIZE rounds + size_t round = i%MI_INTPTR_SIZE; + if (round == 0) rnd = _mi_random_shuffle(rnd); + // select a random next slice index + size_t next = ((rnd >> 8*round) & (slice_count-1)); + while (counts[next]==0) { // ensure it still has space + next++; + if (next==slice_count) next = 0; + } + // and link the current block to it + counts[next]--; + mi_block_t* block = blocks[current]; + blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block + mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next` + current = next; + } + mi_block_set_next(page, blocks[current], NULL); // end of the list + heap->random = _mi_random_shuffle(rnd); +}
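The loop above draws fresh random bits only once every `MI_INTPTR_SIZE` iterations, consuming 8 bits per round. A tiny standalone illustration of that byte-reuse trick, with xorshift standing in for `_mi_random_shuffle` and a power-of-two slice count so the mask works:

#include <stdint.h>
#include <stdio.h>

static uintptr_t xorshift(uintptr_t x) {   // stand-in RNG step
  x ^= x << 13; x ^= x >> 7; x ^= x << 17;
  return x;
}

int main(void) {
  uintptr_t rnd = 0x9e3779b97f4a7c15u;     // arbitrary seed
  size_t slice_count = 8;                  // must be a power of two
  for (size_t i = 1; i < 32; i++) {
    size_t round = i % sizeof(uintptr_t);  // 8 rounds per shuffle on 64-bit
    if (round == 0) rnd = xorshift(rnd);   // advance the RNG once per word
    size_t next = (rnd >> (8*round)) & (slice_count - 1);
    printf("%zu ", next);                  // the randomly chosen slice index
  }
  printf("\n");
  return 0;
}

Amortizing one RNG advance over eight selections keeps the randomized (secure) free-list initialization cheap relative to the sequential one.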
+ +static void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats) { UNUSED(stats); mi_assert_internal(page->free == NULL); @@ -413,66 +468,17 @@ static void mi_page_free_list_extend( mi_heap_t* heap, mi_page_t* page, size_t e void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); size_t bsize = page->block_size; mi_block_t* start = mi_page_block_at(page, page_area, page->capacity); - if (extend < MI_MIN_SLICES || !mi_option_is_enabled(mi_option_secure)) { - // initialize a sequential free list - mi_block_t* end = mi_page_block_at(page, page_area, page->capacity + extend - 1); - mi_block_t* block = start; - for (size_t i = 0; i < extend; i++) { - mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize); - mi_block_set_next(page,block,next); - block = next; - } - mi_block_set_next(page, end, NULL); - page->free = start; - } - else { - // initialize a randomized free list - // set up `slice_count` slices to alternate between - size_t shift = MI_MAX_SLICE_SHIFT; - while ((extend >> shift) == 0) { - shift--; - } - size_t slice_count = (size_t)1U << shift; - size_t slice_extend = extend / slice_count; - mi_assert_internal(slice_extend >= 1); - mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice - size_t counts[MI_MAX_SLICES]; // available objects in the slice - for (size_t i = 0; i < slice_count; i++) { - blocks[i] = mi_page_block_at(page, page_area, page->capacity + i*slice_extend); - counts[i] = slice_extend; - } - counts[slice_count-1] += (extend % slice_count); // final slice holds the modulus too (todo: distribute evenly?) - // and initialize the free list by randomly threading through them - // set up first element - size_t current = _mi_heap_random(heap) % slice_count; - counts[current]--; - page->free = blocks[current]; - // and iterate through the rest - uintptr_t rnd = heap->random; - for (size_t i = 1; i < extend; i++) { - // call random_shuffle only every INTPTR_SIZE rounds - size_t round = i%MI_INTPTR_SIZE; - if (round == 0) rnd = _mi_random_shuffle(rnd); - // select a random next slice index - size_t next = ((rnd >> 8*round) & (slice_count-1)); - while (counts[next]==0) { // ensure it still has space - next++; - if (next==slice_count) next = 0; - } - // and link the current block to it - counts[next]--; - mi_block_t* block = blocks[current]; - blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block - mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next` - current = next; - } - mi_block_set_next( page, blocks[current], NULL); // end of the list - heap->random = _mi_random_shuffle(rnd); + // initialize a sequential free list + mi_block_t* last = mi_page_block_at(page, page_area, page->capacity + extend - 1); + mi_block_t* block = start; + while (block <= last) { + mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize); + mi_block_set_next(page,block,next); + block = next; } - // enable the new free list - page->capacity += (uint16_t)extend; - _mi_stat_increase(&stats->page_committed, extend * page->block_size); + mi_block_set_next(page, last, NULL); + page->free = start; } /* ----------------------------------------------------------- @@ -518,7 +524,15 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_stats_t* st mi_assert_internal(extend < (1UL<<16)); // and append the extend to the free list - mi_page_free_list_extend(heap, page, extend, stats ); + if (extend < MI_MIN_SLICES || !mi_option_is_enabled(mi_option_secure)) { + mi_page_free_list_extend(page, extend, stats ); + } + else { + mi_page_free_list_extend_secure(heap, page, extend, stats); + } + // enable the new free list + page->capacity += (uint16_t)extend; + _mi_stat_increase(&stats->page_committed, extend * page->block_size); mi_assert_expensive(mi_page_is_valid_init(page)); } @@ -688,7 +702,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) { if (page != NULL) { mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(page->block_size == block_size); - mi_heap_stat_increase( heap, huge, block_size); + _mi_stat_increase( &heap->tld->stats.huge, block_size); } return page; } @@ -708,10 +722,10 @@ void*
_mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept // call potential deferred free routines _mi_deferred_free(heap, false); - + // free delayed frees from other threads _mi_heap_delayed_free(heap); - + // huge allocation? mi_page_t* page; if (mi_unlikely(size > MI_LARGE_SIZE_MAX)) { diff --git a/src/segment.c b/src/segment.c index 8f254a26..736345bf 100644 --- a/src/segment.c +++ b/src/segment.c @@ -226,6 +226,7 @@ static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) { static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) { + segment->thread_id = 0; mi_segments_track_size(-((long)segment_size),tld); if (mi_option_is_enabled(mi_option_secure)) { _mi_mem_unprotect(segment, segment->segment_size); // ensure no more guard pages are set @@ -235,8 +236,8 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se // The thread local segment cache is limited to be at most 1/8 of the peak size of segments in use, -// and no more than 2. -#define MI_SEGMENT_CACHE_MAX (2) +// and no more than 4. +#define MI_SEGMENT_CACHE_MAX (4) #define MI_SEGMENT_CACHE_FRACTION (8) // note: returned segment may be partially reset @@ -248,17 +249,19 @@ static mi_segment_t* mi_segment_cache_pop(size_t segment_size, mi_segments_tld_t tld->cache = segment->next; segment->next = NULL; mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); + _mi_stat_decrease(&tld->stats->segments_cache, 1); return segment; } static bool mi_segment_cache_full(mi_segments_tld_t* tld) { - if (tld->cache_count < MI_SEGMENT_CACHE_MAX && - tld->cache_count < (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { // always allow 1 element cache + if (tld->cache_count < MI_SEGMENT_CACHE_MAX + && tld->cache_count < (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION)) + ) { // always allow 1 element cache return false; } // take the opportunity to reduce the segment cache if it is too large (now) // TODO: this never happens as we check against peak usage, should we use current usage instead? 
- while (tld->cache_count > (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { + while (tld->cache_count > MI_SEGMENT_CACHE_MAX ) { //(1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { mi_segment_t* segment = mi_segment_cache_pop(0,tld); mi_assert_internal(segment != NULL); if (segment != NULL) mi_segment_os_free(segment, segment->segment_size, tld); @@ -269,7 +272,9 @@ static bool mi_segment_cache_full(mi_segments_tld_t* tld) { static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(!mi_segment_is_in_free_queue(segment, tld)); mi_assert_internal(segment->next == NULL); - if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) return false; + if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) { + return false; + } mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); if (mi_option_is_enabled(mi_option_cache_reset)) { _mi_mem_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size, tld->stats); @@ -277,6 +282,7 @@ static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) segment->next = tld->cache; tld->cache = segment; tld->cache_count++; + _mi_stat_increase(&tld->stats->segments_cache,1); return true; } @@ -407,8 +413,7 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t mi_assert_expensive(!mi_segment_queue_contains(&tld->medium_free, segment)); mi_assert(segment->next == NULL); mi_assert(segment->prev == NULL); - _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); - segment->thread_id = 0; + _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); // update reset memory statistics /* @@ -613,6 +618,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen } else { // otherwise reclaim it + mi_page_init_flags(page,segment->thread_id); _mi_page_reclaim(heap,page); } } @@ -643,6 +649,7 @@ static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tl mi_assert_internal(mi_segment_has_free(segment)); mi_page_t* page = mi_segment_find_free(segment, tld->stats); page->segment_in_use = true; + mi_page_init_flags(page,segment->thread_id); segment->used++; mi_assert_internal(segment->used <= segment->capacity); if (segment->used == segment->capacity) { @@ -682,6 +689,7 @@ static mi_page_t* mi_segment_large_page_alloc(mi_segments_tld_t* tld, mi_os_tld_ segment->used = 1; mi_page_t* page = &segment->pages[0]; page->segment_in_use = true; + mi_page_init_flags(page,segment->thread_id); return page; } @@ -693,22 +701,27 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld segment->used = 1; mi_page_t* page = &segment->pages[0]; page->segment_in_use = true; + mi_page_init_flags(page,segment->thread_id); return page; } /* ----------------------------------------------------------- Page allocation and free ----------------------------------------------------------- */ +static bool mi_is_good_fit(size_t bsize, size_t size) { + // good fit if no more than 25% wasted + return (bsize > 0 && size > 0 && bsize < size && (size % bsize) <= (size/4)); +} mi_page_t* _mi_segment_page_alloc(size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_page_t* page; - if (block_size <= (MI_SMALL_PAGE_SIZE/4)) { + if (block_size <= MI_SMALL_SIZE_MAX || mi_is_good_fit(block_size,MI_SMALL_PAGE_SIZE)) { page = mi_segment_small_page_alloc(tld,os_tld); } - else if (block_size <=
(MI_MEDIUM_PAGE_SIZE/4)) { + else if (block_size <= MI_MEDIUM_SIZE_MAX || mi_is_good_fit(block_size, MI_MEDIUM_PAGE_SIZE)) { page = mi_segment_medium_page_alloc(tld, os_tld); } - else if (block_size < (MI_LARGE_SIZE_MAX - sizeof(mi_segment_t))) { + else if (block_size < MI_LARGE_SIZE_MAX || mi_is_good_fit(block_size, MI_LARGE_PAGE_SIZE - sizeof(mi_segment_t))) { page = mi_segment_large_page_alloc(tld, os_tld); } else { diff --git a/src/stats.c b/src/stats.c index 2b15bf9e..8725e48c 100644 --- a/src/stats.c +++ b/src/stats.c @@ -99,14 +99,14 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1); mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1); mi_stat_add(&stats->mmap_calls, &src->mmap_calls, 1); - mi_stat_add(&stats->mmap_ensure_aligned, &src->mmap_ensure_aligned, 1); - mi_stat_add(&stats->mmap_right_align, &src->mmap_right_align, 1); mi_stat_add(&stats->commit_calls, &src->commit_calls, 1); mi_stat_add(&stats->threads, &src->threads, 1); mi_stat_add(&stats->pages_extended, &src->pages_extended, 1); mi_stat_add(&stats->malloc, &src->malloc, 1); + mi_stat_add(&stats->segments_cache, &src->segments_cache, 1); mi_stat_add(&stats->huge, &src->huge, 1); + mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); mi_stat_counter_add(&stats->searches, &src->searches, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { @@ -172,10 +172,15 @@ static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t } static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, FILE* out ) { - double avg = (stat->count == 0 ? 0.0 : (double)stat->total / (double)stat->count); - _mi_fprintf(out,"%10s: %7.1f avg\n", msg, avg); + _mi_fprintf(out, "%10s:", msg); + mi_print_amount(stat->total, -1, out); + _mi_fprintf(out, "\n"); } +static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, FILE* out) { + double avg = (stat->count == 0 ? 0.0 : (double)stat->total / (double)stat->count); + _mi_fprintf(out, "%10s: %7.1f avg\n", msg, avg); +} static void mi_print_header( FILE* out ) { @@ -229,15 +234,15 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, FILE* out) mi_attr_n mi_stat_print(&stats->page_committed, "touched", 1, out); mi_stat_print(&stats->segments, "segments", -1, out); mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out); + mi_stat_print(&stats->segments_cache, "-cached", -1, out); mi_stat_print(&stats->pages, "pages", -1, out); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out); mi_stat_print(&stats->pages_extended, "-extended", 0, out); + mi_stat_counter_print(&stats->page_no_retire, "-noretire", out); mi_stat_print(&stats->mmap_calls, "mmaps", 0, out); - mi_stat_print(&stats->mmap_right_align, "mmap fast", 0, out); - mi_stat_print(&stats->mmap_ensure_aligned, "mmap slow", 0, out); mi_stat_print(&stats->commit_calls, "commits", 0, out); mi_stat_print(&stats->threads, "threads", 0, out); - mi_stat_counter_print(&stats->searches, "searches", out); + mi_stat_counter_print_avg(&stats->searches, "searches", out); if (secs >= 0.0) _mi_fprintf(out, "%10s: %9.3f s\n", "elapsed", secs); diff --git a/test/main-override-static.c b/test/main-override-static.c index 6ddf4f37..94891cc3 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -6,6 +6,7 @@ #include <mimalloc.h> #include <mimalloc-override.h> // redefines malloc etc. + int main() { mi_version(); void* p1 = malloc(78);
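To close, a small demonstration of the `mi_is_good_fit` rule used in `_mi_segment_page_alloc` above, assuming the 64-bit defaults of 64KiB small and 512KiB medium pages; it exercises only the good-fit fallback, not the primary `*_SIZE_MAX` checks that run first:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define SMALL_PAGE   (64*1024)
#define MEDIUM_PAGE  (512*1024)

static bool is_good_fit(size_t bsize, size_t size) {
  // good fit if no more than 25% of the page is wasted
  return (bsize > 0 && size > 0 && bsize < size && (size % bsize) <= (size/4));
}

int main(void) {
  size_t sizes[] = { 60*1024, 150*1024, 400*1024 };
  for (size_t i = 0; i < sizeof(sizes)/sizeof(*sizes); i++) {
    size_t b = sizes[i];
    const char* kind = is_good_fit(b, SMALL_PAGE)  ? "small"
                     : is_good_fit(b, MEDIUM_PAGE) ? "medium" : "large";
    printf("block %7zu -> %s page\n", b, kind);
  }
  return 0;
}

A 60KiB block fills a 64KiB small page almost completely (4KiB wasted), so it can stay in a small page even though it exceeds the small size class; blocks that would waste more than a quarter of the page fall through to the next larger page kind.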