initial checkin

daan 2019-06-19 16:26:12 -07:00
parent 23b4e65faa
commit 26a874eb3f
41 changed files with 11897 additions and 0 deletions

146
src/alloc-aligned.c Normal file

@@ -0,0 +1,146 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include <string.h> // memset
// ------------------------------------------------------
// Aligned Allocation
// ------------------------------------------------------
static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
// note: we don't require `size > offset`, we just guarantee that
// the address at offset is aligned regardless of the allocated size.
mi_assert(alignment > 0);
if (alignment <= sizeof(uintptr_t)) return _mi_heap_malloc_zero(heap,size,zero);
if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
// try if there is a current small block with just the right alignment
if (size <= MI_SMALL_SIZE_MAX) {
mi_page_t* page = _mi_heap_get_free_small_page(heap,size);
if (page->free != NULL &&
(((uintptr_t)page->free + offset) % alignment) == 0)
{
#if MI_STAT>1
mi_heap_stat_increase( heap, malloc, size);
#endif
void* p = _mi_page_malloc(heap,page,size);
mi_assert_internal(p != NULL);
mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
if (zero) memset(p,0,size);
return p;
}
}
// otherwise over-allocate
void* p = _mi_heap_malloc_zero(heap, size + alignment - 1, zero);
if (p == NULL) return NULL;
// .. and align within the allocation
_mi_ptr_page(p)->flags.has_aligned = true;
uintptr_t adjust = alignment - (((uintptr_t)p + offset) % alignment);
mi_assert_internal(adjust % sizeof(uintptr_t) == 0);
void* aligned_p = (adjust == alignment ? p : (void*)((uintptr_t)p + adjust));
mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
mi_assert_internal( p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p),_mi_ptr_page(aligned_p),aligned_p) );
return aligned_p;
}
static void* mi_malloc_zero_aligned_at(size_t size, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
return mi_heap_malloc_zero_aligned_at(mi_get_default_heap(),size,alignment,offset,zero);
}
void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_malloc_zero_aligned_at(size, alignment, offset, false);
}
void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
return mi_malloc_aligned_at(size, alignment, 0);
}
void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_malloc_zero_aligned_at(size,alignment,offset,true);
}
void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
return mi_zalloc_aligned_at(size,alignment,0);
}
void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_zalloc_aligned_at(total,alignment,offset);
}
void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_zalloc_aligned(total,alignment);
}
static void* mi_realloc_zero_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
mi_assert(alignment > 0);
if (alignment <= sizeof(uintptr_t)) return _mi_realloc_zero(p,newsize,zero);
if (p == NULL) return mi_malloc_zero_aligned_at(newsize,alignment,offset,zero);
size_t size = mi_usable_size(p);
if (newsize <= size && newsize >= (size - (size / 2))
&& (((uintptr_t)p + offset) % alignment) == 0) {
return p; // reallocation still fits, is aligned and not more than 50% waste
}
else {
void* newp = mi_malloc_aligned_at(newsize,alignment,offset);
if (newp != NULL) {
if (zero && newsize > size) {
// also set last word in the previous allocation to zero to ensure any padding is zero-initialized
size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
memset((uint8_t*)newp + start, 0, newsize - start);
}
memcpy(newp, p, (newsize > size ? size : newsize));
mi_free(p); // only free if successful
}
return newp;
}
}
static void* _mi_realloc_aligned(void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept {
mi_assert(alignment > 0);
if (alignment <= sizeof(uintptr_t)) return _mi_realloc_zero(p,newsize,zero);
size_t offset = ((uintptr_t)p % alignment); // use offset of previous allocation (p can be NULL)
return mi_realloc_zero_aligned_at(p,newsize,alignment,offset,zero);
}
void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_realloc_zero_aligned_at(p,newsize,alignment,offset,false);
}
void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return _mi_realloc_aligned(p,newsize,alignment,false);
}
void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_realloc_zero_aligned_at(p,newsize,alignment,offset,true);
}
void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return _mi_realloc_aligned(p,newsize,alignment,true);
}
void* mi_recalloc_aligned_at(void* p, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_rezalloc_aligned_at(p,total,alignment,offset);
}
void* mi_recalloc_aligned(void* p, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_rezalloc_aligned(p,total,alignment);
}
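
A usage sketch of the `_at` variants above (illustrative only, not part of this commit; it assumes `mimalloc.h` declares these functions exactly as defined here). Note that it is the address at the given byte offset inside the block that is aligned, not necessarily the block start.

#include "mimalloc.h"
#include <assert.h>
#include <stdint.h>

int main(void) {
  // 64-byte alignment at byte offset 16 inside a 200-byte block
  void* p = mi_malloc_aligned_at(200, 64, 16);
  assert(p != NULL && ((uintptr_t)p + 16) % 64 == 0);
  // zero-initialized block, 32-byte aligned at offset 0
  void* q = mi_zalloc_aligned(100, 32);
  assert(q != NULL && ((uintptr_t)q % 32) == 0);
  mi_free(p);
  mi_free(q);
  return 0;
}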

229
src/alloc-override-osx.c Normal file

@@ -0,0 +1,229 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#if defined(MI_MALLOC_OVERRIDE)
#if !defined(__APPLE__)
#error "this file should only be included on MacOSX"
#endif
/* ------------------------------------------------------
Override system malloc on MacOSX
This is done through the malloc zone interface.
------------------------------------------------------ */
#include <AvailabilityMacros.h>
#include <malloc/malloc.h>
#if defined(MAC_OS_X_VERSION_10_6) && \
MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
// only available from OSX 10.6
extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_import));
#endif
/* ------------------------------------------------------
malloc zone members
------------------------------------------------------ */
static size_t zone_size(malloc_zone_t* zone, const void* p) {
return 0; // as we cannot guarantee that `p` comes from us, just return 0
}
static void* zone_malloc(malloc_zone_t* zone, size_t size) {
return mi_malloc(size);
}
static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) {
return mi_calloc(count, size);
}
static void* zone_valloc(malloc_zone_t* zone, size_t size) {
return mi_malloc_aligned(size, _mi_os_page_size());
}
static void zone_free(malloc_zone_t* zone, void* p) {
return mi_free(p);
}
static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) {
return mi_realloc(p, newsize);
}
static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) {
return mi_malloc_aligned(size,alignment);
}
static void zone_destroy(malloc_zone_t* zone) {
// todo: ignore for now?
}
static size_t zone_batch_malloc(malloc_zone_t* zone, size_t size, void** ps, size_t count) {
size_t i;
for (i = 0; i < count; i++) {
ps[i] = zone_malloc(zone, size);
if (ps[i] == NULL) break;
}
return i;
}
static void zone_batch_free(malloc_zone_t* zone, void** ps, size_t count) {
for(size_t i = 0; i < count; i++) {
zone_free(zone, ps[i]);
ps[i] = NULL;
}
}
static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) {
mi_collect(false);
return 0;
}
static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) {
zone_free(zone,p);
}
/* ------------------------------------------------------
Introspection members
------------------------------------------------------ */
static kern_return_t intro_enumerator(task_t task, void* p,
unsigned type_mask, vm_address_t zone_address,
memory_reader_t reader,
vm_range_recorder_t recorder)
{
// todo: enumerate all memory
return KERN_SUCCESS;
}
static size_t intro_good_size(malloc_zone_t* zone, size_t size) {
return mi_good_size(size);
}
static boolean_t intro_check(malloc_zone_t* zone) {
return true;
}
static void intro_print(malloc_zone_t* zone, boolean_t verbose) {
mi_stats_print(NULL);
}
static void intro_log(malloc_zone_t* zone, void* p) {
// todo?
}
static void intro_force_lock(malloc_zone_t* zone) {
// todo?
}
static void intro_force_unlock(malloc_zone_t* zone) {
// todo?
}
static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
// todo...
stats->blocks_in_use = 0;
stats->size_in_use = 0;
stats->max_size_in_use = 0;
stats->size_allocated = 0;
}
static boolean_t intro_zone_locked(malloc_zone_t* zone) {
return false;
}
/* ------------------------------------------------------
At process start, override the default allocator
------------------------------------------------------ */
static malloc_zone_t* mi_get_default_zone()
{
// The first returned zone is the real default
malloc_zone_t** zones = NULL;
size_t count = 0;
kern_return_t ret = malloc_get_all_zones(0, NULL, (vm_address_t**)&zones, &count);
if (ret == KERN_SUCCESS && count > 0) {
return zones[0];
}
else {
// fallback
return malloc_default_zone();
}
}
static void __attribute__((constructor)) _mi_macosx_override_malloc()
{
static malloc_introspection_t intro;
memset(&intro, 0, sizeof(intro));
intro.enumerator = &intro_enumerator;
intro.good_size = &intro_good_size;
intro.check = &intro_check;
intro.print = &intro_print;
intro.log = &intro_log;
intro.force_lock = &intro_force_lock;
intro.force_unlock = &intro_force_unlock;
static malloc_zone_t zone;
memset(&zone, 0, sizeof(zone));
zone.version = 4;
zone.zone_name = "mimalloc";
zone.size = &zone_size;
zone.introspect = &intro;
zone.malloc = &zone_malloc;
zone.calloc = &zone_calloc;
zone.valloc = &zone_valloc;
zone.free = &zone_free;
zone.realloc = &zone_realloc;
zone.destroy = &zone_destroy;
zone.batch_malloc = &zone_batch_malloc;
zone.batch_free = &zone_batch_free;
malloc_zone_t* purgeable_zone = NULL;
#if defined(MAC_OS_X_VERSION_10_6) && \
MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
// switch to version 9 on OSX 10.6 to support memalign.
zone.version = 9;
zone.memalign = &zone_memalign;
zone.free_definite_size = &zone_free_definite_size;
zone.pressure_relief = &zone_pressure_relief;
intro.zone_locked = &intro_zone_locked;
// force the purgeable zone to exist to avoid strange bugs
if (malloc_default_purgeable_zone) {
purgeable_zone = malloc_default_purgeable_zone();
}
#endif
// Register our zone
malloc_zone_register(&zone);
// Unregister the default zone, this makes our zone the new default
// as that was the last registered.
malloc_zone_t *default_zone = mi_get_default_zone();
malloc_zone_unregister(default_zone);
// Reregister the default zone so free and realloc in that zone keep working.
malloc_zone_register(default_zone);
// Unregister and re-register the purgeable_zone to avoid bugs if it was registered
// earlier than the default zone.
if (purgeable_zone != NULL) {
malloc_zone_unregister(purgeable_zone);
malloc_zone_register(purgeable_zone);
}
}
#endif // MI_MALLOC_OVERRIDE
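
A minimal check that the zone swap above took effect (an illustrative sketch, not part of this commit; it assumes the constructor has run and that `malloc_default_zone()` returns the last-registered zone, which newer macOS versions may wrap):

#include <malloc/malloc.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  malloc_zone_t* z = malloc_default_zone();
  // the constructor above names our zone "mimalloc"
  printf("default zone: %s\n", z->zone_name != NULL ? z->zone_name : "(unnamed)");
  void* p = malloc(32);   // dispatched through zone_malloc -> mi_malloc
  free(p);                // dispatched through zone_free   -> mi_free
  return 0;
}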

650
src/alloc-override-win.c Normal file

@@ -0,0 +1,650 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#if !defined(_WIN32)
#error "this file should only be included on Windows"
#endif
#include <windows.h>
#include <psapi.h>
/*
To override the C runtime `malloc` on Windows we need to patch the allocation
functions at runtime initialization. Unfortunately we can never patch before the
runtime initializes itself, because as soon as we call `GetProcAddress` on the
runtime module (a DLL or EXE in Windows speak), it will first load and initialize
(by the OS calling `DllMain` on it).
This means that some things might already be allocated by the C runtime itself
(and possibly other DLL's) before we get to resolve the runtime addresses. This is
no problem if everyone unwinds in order: when we unload, we unpatch and restore
the original crt `free` routines and crt malloc'd memory is freed correctly.
But things go wrong if such early CRT alloc'd memory is freed or re-allocated
_after_ we patch, but _before_ we unload (and unpatch), or if any memory allocated
by us is freed after we unpatched.
There are two tricky situations to deal with:
1. The Thread Local Storage (TLS): when the main thread stops it will call registered
callbacks on TLS entries (allocated by `FlsAlloc`). This is done by the OS
before any DLL's are unloaded. Unfortunately, the C runtime registers such
TLS entries with CRT allocated memory which is freed in the callback.
2. Inside the CRT:
a. Some variables might get initialized by patched allocated
blocks but freed during CRT unloading after we unpatched
(like temporary file buffers).
b. Some blocks are allocated at CRT and freed by the CRT (like the
environment storage).
c. And some blocks are allocated by the CRT and then reallocated
while patched, and finally freed after unpatching! This
happens with the `atexit` functions for example to grow the array
of registered functions.
In principle situation 2 is hopeless: since we cannot patch before CRT initialization,
we can never be sure how to free or reallocate a pointer during CRT unloading.
However, in practice there is a good solution: when terminating, we just patch
the reallocation and free routines to no-ops -- we are winding down anyway! This leaves
just the reallocation problem of CRT alloc'd memory once we are patched. Here, a study of the
CRT reveals that there seem to be just three such situations:
1. When registering `atexit` routines (to grow the exit function table),
2. When calling `_setmaxstdio` (to grow the file handle table),
3. and `_popen`/`_wpopen` (to grow handle pairs). These turn out not to be
a problem as these are NULL initialized.
We fix these by providing wrappers:
1. We first register a _global_ `atexit` routine ourselves (`mi_patches_at_exit`) before patching,
and then patch the `_crt_atexit` function to implement our own global exit list (and the
same for `_crt_at_quick_exit`). All module local lists are no problem since they are always fully
(un)patched from initialization to end. We can register in the global list by dynamically
getting the global `_crt_atexit` entry from `ucrtbase.dll`.
2. The `_setmaxstdio` is _detoured_: we patch it by a stub that unpatches first,
calls the original routine and repatches again.
That leaves us to reliably shutdown and enter "termination mode":
1. Using our trick to get the global exit list entry point, we register an exit function `mi_patches_atexit`
that first executes all our home brew list of exit functions, and then enters a _termination_
phase that patches realloc/free variants with no-ops. Patching later again with special no-ops for
`free` also improves efficiency during the program run since no flags need to be checked.
2. That is not quite good enough yet: after the exit routines registered after us on the
global exit list (by the CRT) have run, the OS starts to unwind the TLS callbacks, and we
would like callbacks registered after our DLL was loaded to still run in patched mode.
So, we also allocate a TLS entry when our DLL is loaded and when its
callback is called, we re-enable the original patches again. Since TLS is destroyed in FIFO order
this runs any callbacks in later DLL's in patched mode.
3. Finally the DLL's get unloaded by the OS in order (still patched) until our DLL gets unloaded
and then we start a termination phase again, and patch realloc/free with no-ops for good this time.
*/
static int __cdecl mi_setmaxstdio(int newmax);
// ------------------------------------------------------
// Microsoft allocation extensions
// ------------------------------------------------------
#define UNUSED(x) (void)(x) // suppress unused variable warnings
static void* mi__expand(void* p, size_t newsize) {
void* res = mi_expand(p, newsize);
if (res == NULL) errno = ENOMEM;
return res;
}
// Versions of `free`, `realloc`, `recalloc`, `expand` and `msize`
// that are used during termination and are no-ops.
static void mi_free_term(void* p) {
UNUSED(p);
}
static void* mi_realloc_term(void* p, size_t newsize) {
UNUSED(p); UNUSED(newsize);
return NULL;
}
static void* mi__recalloc_term(void* p, size_t newcount, size_t newsize) {
UNUSED(p); UNUSED(newcount); UNUSED(newsize);
return NULL;
}
static void* mi__expand_term(void* p, size_t newsize) {
UNUSED(p); UNUSED(newsize);
return NULL;
}
static size_t mi__msize_term(void* p) {
UNUSED(p);
return 0;
}
// Debug versions, forward to base versions (that get patched)
static void* mi__malloc_dbg(size_t size, int block_type, const char* fname, int line) {
UNUSED(block_type); UNUSED(fname); UNUSED(line);
return _malloc_base(size);
}
static void* mi__calloc_dbg(size_t count, size_t size, int block_type, const char* fname, int line) {
UNUSED(block_type); UNUSED(fname); UNUSED(line);
return _calloc_base(count, size);
}
static void* mi__realloc_dbg(void* p, size_t size, int block_type, const char* fname, int line) {
UNUSED(block_type); UNUSED(fname); UNUSED(line);
return _realloc_base(p, size);
}
static void mi__free_dbg(void* p, int block_type) {
UNUSED(block_type);
_free_base(p);
}
// the `recalloc`,`expand`, and `msize` don't have base versions and thus need a separate term version
static void* mi__recalloc_dbg(void* p, size_t count, size_t size, int block_type, const char* fname, int line) {
UNUSED(block_type); UNUSED(fname); UNUSED(line);
return mi_recalloc(p, count, size);
}
static void* mi__expand_dbg(void* p, size_t size, int block_type, const char* fname, int line) {
UNUSED(block_type); UNUSED(fname); UNUSED(line);
return mi__expand(p, size);
}
static size_t mi__msize_dbg(void* p, int block_type) {
UNUSED(block_type);
return mi_usable_size(p);
}
static void* mi__recalloc_dbg_term(void* p, size_t count, size_t size, int block_type, const char* fname, int line) {
UNUSED(block_type); UNUSED(fname); UNUSED(line);
return mi__recalloc_term(p, count, size);
}
static void* mi__expand_dbg_term(void* p, size_t size, int block_type, const char* fname, int line) {
UNUSED(block_type); UNUSED(fname); UNUSED(line);
return mi__expand_term(p, size);
}
static size_t mi__msize_dbg_term(void* p, int block_type) {
UNUSED(block_type);
return mi__msize_term(p);
}
// ------------------------------------------------------
// implement our own global atexit handler
// ------------------------------------------------------
typedef void (cbfun_t)();
typedef int (atexit_fun_t)(cbfun_t* fn);
typedef uintptr_t encoded_t;
typedef struct exit_list_s {
encoded_t functions; // encoded pointer to array of encoded function pointers
size_t count;
size_t capacity;
} exit_list_t;
#define MI_EXIT_INC (64)
static exit_list_t atexit_list = { 0, 0, 0 };
static exit_list_t at_quick_exit_list = { 0, 0, 0 };
static CRITICAL_SECTION atexit_lock;
// encode/decode function pointers with a random canary for security
static encoded_t canary;
static inline void *decode(encoded_t x) {
return (void*)(x^canary);
}
static inline encoded_t encode(void* p) {
return ((uintptr_t)p ^ canary);
}
static void init_canary()
{
canary = _mi_random_init(0);
atexit_list.functions = at_quick_exit_list.functions = encode(NULL);
}
// initialize the list
static void mi_initialize_atexit() {
InitializeCriticalSection(&atexit_lock);
init_canary();
}
// register an exit function
static int mi_register_atexit(exit_list_t* list, cbfun_t* fn) {
if (fn == NULL) return EINVAL;
EnterCriticalSection(&atexit_lock);
encoded_t* functions = (encoded_t*)decode(list->functions);
if (list->count >= list->capacity) { // at first `functions == decode(0) == NULL`
encoded_t* newf = (encoded_t*)mi_recalloc(functions, list->capacity + MI_EXIT_INC, sizeof(cbfun_t*));
if (newf != NULL) {
list->capacity += MI_EXIT_INC;
list->functions = encode(newf);
functions = newf;
}
}
int result;
if (list->count < list->capacity && functions != NULL) {
functions[list->count] = encode(fn);
list->count++;
result = 0; // success
}
else {
result = ENOMEM;
}
LeaveCriticalSection(&atexit_lock);
return result;
}
// Register a global `atexit` function
static int mi__crt_atexit(cbfun_t* fn) {
return mi_register_atexit(&atexit_list,fn);
}
static int mi__crt_at_quick_exit(cbfun_t* fn) {
return mi_register_atexit(&at_quick_exit_list,fn);
}
// Execute exit functions in a list
static void mi_execute_exit_list(exit_list_t* list) {
// copy and zero the list structure
EnterCriticalSection(&atexit_lock);
exit_list_t clist = *list;
memset(list,0,sizeof(*list));
LeaveCriticalSection(&atexit_lock);
// now execute the functions outside of the lock
encoded_t* functions = (encoded_t*)decode(clist.functions);
if (functions != NULL) {
for (size_t i = clist.count; i > 0; i--) { // careful with unsigned count down..
cbfun_t* fn = (cbfun_t*)decode(functions[i-1]);
if (fn==NULL) break; // corrupted!
fn();
}
mi_free(functions);
}
}
// ------------------------------------------------------
// Jump assembly instructions for patches
// ------------------------------------------------------
#if defined(_M_IX86) || defined(_M_X64)
#define MI_JUMP_SIZE 14 // at most 2+4+8 for a long jump or 1+5 for a short one
typedef struct mi_jump_s {
uint8_t opcodes[MI_JUMP_SIZE];
} mi_jump_t;
void mi_jump_restore(void* current, const mi_jump_t* saved) {
memcpy(current, &saved->opcodes, MI_JUMP_SIZE);
}
void mi_jump_write(void* current, void* target, mi_jump_t* save) {
if (save != NULL) {
memcpy(&save->opcodes, current, MI_JUMP_SIZE);
}
uint8_t* opcodes = ((mi_jump_t*)current)->opcodes;
ptrdiff_t diff = (uint8_t*)target - (uint8_t*)current;
uint32_t ofs32 = (uint32_t)diff;
#ifdef _M_X64
uint64_t ofs64 = (uint64_t)diff;
if (ofs64 != (uint64_t)ofs32) {
// use long jump
opcodes[0] = 0xFF;
opcodes[1] = 0x25;
*((uint32_t*)&opcodes[2]) = 0;
*((uint64_t*)&opcodes[6]) = (uint64_t)target;
}
else
#endif
{
// use short jump
opcodes[0] = 0xE9;
*((uint32_t*)&opcodes[1]) = ofs32 - 5 /* size of the short jump instruction */;
}
}
#elif defined(_M_ARM64)
#define MI_JUMP_SIZE 16
typedef struct mi_jump_s {
uint8_t opcodes[MI_JUMP_SIZE];
} mi_jump_t;
void mi_jump_restore(void* current, const mi_jump_t* saved) {
memcpy(current, &saved->opcodes, MI_JUMP_SIZE);
}
void mi_jump_write(void* current, void* target, mi_jump_t* save) {
if (save != NULL) {
memcpy(&save->opcodes, current, MI_JUMP_SIZE);
}
uint8_t* opcodes = ((mi_jump_t*)current)->opcodes;
uint64_t diff = (uint8_t*)target - (uint8_t*)current;
// 0x50 0x00 0x00 0x58 ldr x16, .+8 # load PC relative +8
// 0x00 0x02 0x3F 0xD6 blr x16 # and jump
// <address>
// <address>
static const uint8_t jump_opcodes[8] = { 0x50, 0x00, 0x00, 0x58, 0x00, 0x02, 0x3F, 0xD6 };
memcpy(&opcodes[0], jump_opcodes, sizeof(jump_opcodes));
*((uint64_t*)&opcodes[8]) = diff;
}
#else
#error "define jump instructions for this platform"
#endif
// ------------------------------------------------------
// Patches
// ------------------------------------------------------
typedef enum patch_apply_e {
PATCH_NONE,
PATCH_TARGET,
PATCH_TARGET_TERM
} patch_apply_t;
typedef struct mi_patch_s {
const char* name; // name of the function to patch
void* original; // the resolved address of the function (or NULL)
void* target; // the address of the new target (never NULL)
void* target_term;// the address of the target during termination (or NULL)
patch_apply_t applied; // what target has been applied?
mi_jump_t save; // the saved instructions in case it was applied
} mi_patch_t;
#define MI_PATCH_NAME3(name,target,term) { name, NULL, &target, &term, false }
#define MI_PATCH_NAME2(name,target) { name, NULL, &target, NULL, false }
#define MI_PATCH3(name,target,term) MI_PATCH_NAME3(#name, target, term)
#define MI_PATCH2(name,target) MI_PATCH_NAME2(#name, target)
#define MI_PATCH1(name) MI_PATCH2(name,mi_##name)
static mi_patch_t patches[] = {
// we implement our own global exit handler (as the CRT versions do a realloc internally)
MI_PATCH2(_crt_atexit, mi__crt_atexit),
MI_PATCH2(_crt_at_quick_exit, mi__crt_at_quick_exit),
MI_PATCH2(_setmaxstdio, mi_setmaxstdio),
// base versions
MI_PATCH2(_malloc_base, mi_malloc),
MI_PATCH2(_calloc_base, mi_calloc),
MI_PATCH3(_realloc_base, mi_realloc,mi_realloc_term),
MI_PATCH3(_free_base, mi_free,mi_free_term),
// regular entries
MI_PATCH3(_expand, mi__expand,mi__expand_term),
MI_PATCH3(_recalloc, mi_recalloc,mi__recalloc_term),
MI_PATCH3(_msize, mi_usable_size,mi__msize_term),
// these base versions are in the crt but without import records
MI_PATCH_NAME3("_recalloc_base", mi_recalloc,mi__recalloc_term),
MI_PATCH_NAME3("_msize_base", mi_usable_size,mi__msize_term),
// utility
MI_PATCH2(_strdup, mi_strdup),
MI_PATCH2(_strndup, mi_strndup),
// debug
MI_PATCH2(_malloc_dbg, mi__malloc_dbg),
MI_PATCH2(_realloc_dbg, mi__realloc_dbg),
MI_PATCH2(_calloc_dbg, mi__calloc_dbg),
MI_PATCH2(_free_dbg, mi__free_dbg),
MI_PATCH3(_expand_dbg, mi__expand_dbg, mi__expand_dbg_term),
MI_PATCH3(_recalloc_dbg, mi__recalloc_dbg, mi__recalloc_dbg_term),
MI_PATCH3(_msize_dbg, mi__msize_dbg, mi__msize_dbg_term),
#ifdef _WIN64
// 64 bit new/delete
MI_PATCH_NAME2("??2@YAPEAX_K@Z", mi_malloc),
MI_PATCH_NAME2("??_U@YAPEAX_K@Z", mi_malloc),
MI_PATCH_NAME3("??3@YAXPEAX@Z", mi_free, mi_free_term),
MI_PATCH_NAME3("??_V@YAXPEAX@Z", mi_free, mi_free_term),
MI_PATCH_NAME2("??2@YAPEAX_KAEBUnothrow_t@std@@@Z", mi_malloc),
MI_PATCH_NAME2("??_U@YAPEAX_KAEBUnothrow_t@std@@@Z", mi_malloc),
MI_PATCH_NAME3("??3@YAXPEAXAEBUnothrow_t@std@@@Z", mi_free, mi_free_term),
MI_PATCH_NAME3("??_V@YAXPEAXAEBUnothrow_t@std@@@Z", mi_free, mi_free_term),
#else
// 32 bit new/delete
MI_PATCH_NAME2("??2@YAPAXI@Z", mi_malloc),
MI_PATCH_NAME2("??_U@YAPAXI@Z", mi_malloc),
MI_PATCH_NAME3("??3@YAXPAX@Z", mi_free, mi_free_term),
MI_PATCH_NAME3("??_V@YAXPAX@Z", mi_free, mi_free_term),
MI_PATCH_NAME2("??2@YAPAXIABUnothrow_t@std@@@Z", mi_malloc),
MI_PATCH_NAME2("??_U@YAPAXIABUnothrow_t@std@@@Z", mi_malloc),
MI_PATCH_NAME3("??3@YAXPAXABUnothrow_t@std@@@Z", mi_free, mi_free_term),
MI_PATCH_NAME3("??_V@YAXPAXABUnothrow_t@std@@@Z", mi_free, mi_free_term),
#endif
{ NULL, NULL, NULL, false }
};
// Apply a patch
static bool mi_patch_apply(mi_patch_t* patch, patch_apply_t apply)
{
if (patch->original == NULL) return true; // unresolved
if (apply == PATCH_TARGET_TERM && patch->target_term == NULL) apply = PATCH_TARGET; // avoid re-applying non-term variants
if (patch->applied == apply) return false;
DWORD protect = PAGE_READWRITE;
if (!VirtualProtect(patch->original, MI_JUMP_SIZE, PAGE_EXECUTE_READWRITE, &protect)) return false;
if (apply == PATCH_NONE) {
mi_jump_restore(patch->original, &patch->save);
}
else {
void* target = (apply == PATCH_TARGET ? patch->target : patch->target_term);
mi_assert_internal(target!=NULL);
if (target != NULL) mi_jump_write(patch->original, target, &patch->save);
}
patch->applied = apply;
VirtualProtect(patch->original, MI_JUMP_SIZE, protect, &protect);
return true;
}
// Apply all patches
static bool _mi_patches_apply(patch_apply_t apply, patch_apply_t* previous) {
static patch_apply_t current = PATCH_NONE;
if (previous != NULL) *previous = current;
if (current == apply) return true;
current = apply;
bool ok = true;
for (size_t i = 0; patches[i].name != NULL; i++) {
if (!mi_patch_apply(&patches[i], apply)) ok = false;
}
return ok;
}
// Export the following three functions just in case
// a user needs that level of control.
// Disable all patches
mi_decl_export void mi_patches_disable(void) {
_mi_patches_apply(PATCH_NONE, NULL);
}
// Enable all patches normally
mi_decl_export bool mi_patches_enable() {
return _mi_patches_apply( PATCH_TARGET, NULL );
}
// Enable all patches in termination phase where free is a no-op
mi_decl_export bool mi_patches_enable_term() {
return _mi_patches_apply(PATCH_TARGET_TERM, NULL);
}
// ------------------------------------------------------
// Stub for _setmaxstdio
// ------------------------------------------------------
static int __cdecl mi_setmaxstdio(int newmax) {
patch_apply_t previous;
_mi_patches_apply(PATCH_NONE, &previous); // disable patches
int result = _setmaxstdio(newmax); // call original function (that calls original CRT recalloc)
_mi_patches_apply(previous,NULL); // and re-enable patches
return result;
}
// ------------------------------------------------------
// Resolve addresses dynamically
// ------------------------------------------------------
// Try to resolve patches for a given module (DLL)
static void mi_module_resolve(HMODULE mod) {
// see if any patches apply
for (size_t i = 0; patches[i].name != NULL; i++) {
mi_patch_t* patch = &patches[i];
if (!patch->applied && patch->original==NULL) {
void* addr = GetProcAddress(mod, patch->name);
if (addr != NULL) {
// found it! set the address
patch->original = addr;
}
}
}
}
#define MIMALLOC_NAME "mimalloc-override"
#define UCRTBASE_NAME "ucrtbase"
// Resolve addresses of all patches by inspecting the loaded modules
static atexit_fun_t* crt_atexit = NULL;
static atexit_fun_t* crt_at_quick_exit = NULL;
static bool mi_patches_resolve() {
// get all loaded modules
HANDLE process = GetCurrentProcess(); // always -1, no need to release
DWORD needed = 0;
HMODULE modules[400]; // try to stay under 4k to not trigger the guard page
EnumProcessModules(process, modules, sizeof(modules), &needed);
if (needed == 0) return false;
size_t count = needed / sizeof(HMODULE);
size_t ucrtbase_index = 0;
size_t mimalloc_index = 0;
// iterate through the loaded modules
for (size_t i = 0; i < count; i++) {
HMODULE mod = modules[i];
char filename[MAX_PATH] = { 0 };
DWORD slen = GetModuleFileName(mod, filename, MAX_PATH);
if (slen > 0 && slen < MAX_PATH) {
// filter out potential crt modules only
filename[slen] = 0;
const char* lastsep = strrchr(filename, '\\');
const char* basename = (lastsep==NULL ? filename : lastsep+1);
if (i==0 // main module to allow static crt linking
|| _strnicmp(basename, "ucrt", 4) == 0 // new ucrtbase.dll in windows 10
|| _strnicmp(basename, "msvcr", 5) == 0) // older runtimes
{
// remember indices so we can check load order (in debug mode)
if (_stricmp(basename, MIMALLOC_NAME) == 0) mimalloc_index = i;
if (_stricmp(basename, UCRTBASE_NAME) == 0) ucrtbase_index = i;
// probably found a crt module, try to patch it
mi_module_resolve(mod);
// try to find the atexit functions for the main process (in `ucrtbase.dll`)
if (crt_atexit==NULL) crt_atexit = (atexit_fun_t*)GetProcAddress(mod, "_crt_atexit");
if (crt_at_quick_exit == NULL) crt_at_quick_exit = (atexit_fun_t*)GetProcAddress(mod, "_crt_at_quick_exit");
}
}
}
#if (MI_DEBUG)
size_t diff = (mimalloc_index > ucrtbase_index ? mimalloc_index - ucrtbase_index : ucrtbase_index - mimalloc_index);
if ((mimalloc_index > 0 || ucrtbase_index > 0) && (diff != 1)) {
_mi_warning_message("warning: the \"mimalloc-override\" DLL seems not to load right before or after the C runtime (\"ucrtbase\").\n"
" Try to fix this by changing the linking order.");
}
#endif
return true;
}
// ------------------------------------------------------
// Dll Entry
// ------------------------------------------------------
extern BOOL WINAPI _DllMainCRTStartup(HINSTANCE inst, DWORD reason, LPVOID reserved);
static DWORD mi_fls_unwind_entry;
static void NTAPI mi_fls_unwind(PVOID value) {
if (value != NULL) mi_patches_enable(); // and re-enable normal patches again for DLL's loaded after us
return;
}
static void mi_patches_atexit() {
mi_execute_exit_list(&atexit_list);
mi_patches_enable_term(); // enter termination phase and patch realloc/free with a no-op
}
static void mi_patches_at_quick_exit() {
mi_execute_exit_list(&at_quick_exit_list);
mi_patches_enable_term(); // enter termination phase and patch realloc/free with a no-op
}
__declspec(dllexport) BOOL WINAPI DllEntry(HINSTANCE inst, DWORD reason, LPVOID reserved) {
if (reason == DLL_PROCESS_ATTACH) {
__security_init_cookie();
}
else if (reason == DLL_PROCESS_DETACH) {
// enter termination phase for good now
mi_patches_enable_term();
}
// C runtime main
BOOL ok = _DllMainCRTStartup(inst, reason, reserved);
if (reason == DLL_PROCESS_ATTACH && ok) {
// Now resolve patches
ok = mi_patches_resolve();
if (ok) {
// and register our unwind entry (this must be after resolving due to possible delayed DLL initialization from GetProcAddress)
mi_fls_unwind_entry = FlsAlloc(&mi_fls_unwind);
if (mi_fls_unwind_entry != FLS_OUT_OF_INDEXES) {
FlsSetValue(mi_fls_unwind_entry, (void*)1);
}
// register our patch disabler in the global exit list
mi_initialize_atexit();
if (crt_atexit != NULL) (*crt_atexit)(&mi_patches_atexit);
if (crt_at_quick_exit != NULL) (*crt_at_quick_exit)(&mi_patches_at_quick_exit);
// and patch ! this also redirects the `atexit` handling for the global exit list
mi_patches_enable();
// hide internal allocation
mi_stats_reset();
}
}
return ok;
}
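
The exit lists above never store raw function pointers; they XOR them with a per-process canary (`encode`/`decode`). Below is a stand-alone sketch of that scheme (illustrative only), using a fixed example canary instead of the value drawn from `_mi_random_init`:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef uintptr_t encoded_t;
static const encoded_t canary = (encoded_t)0x9E3779B97F4A7C15ULL; // example value, not the real one

static encoded_t encode(void* p)     { return (uintptr_t)p ^ canary; }
static void*     decode(encoded_t x) { return (void*)(x ^ canary); }

int main(void) {
  int slot = 0;
  encoded_t e = encode(&slot);
  assert(decode(e) == &slot);      // XOR is its own inverse, so decoding round-trips
  assert(encode(NULL) == canary);  // hence the lists start out as `functions = encode(NULL)`
  return 0;
}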

187
src/alloc-override.c Normal file

@@ -0,0 +1,187 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#if !defined(MI_IN_ALLOC_C)
#error "this file should be included from 'alloc.c' (so aliases can work)"
#endif
#if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL))
#error "It is only possible to override malloc on Windows when building as a DLL (and linking the C runtime as a DLL)"
#endif
#if defined(MI_MALLOC_OVERRIDE) && !defined(_WIN32)
// ------------------------------------------------------
// Override system malloc
// ------------------------------------------------------
#if defined(_MSC_VER)
#pragma warning(disable:4273) // inconsistent dll linking
#endif
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__)
// use aliasing to alias the exported function to one of our `mi_` functions
#define MI_FORWARD(fun) __attribute__((alias(#fun), used));
#define MI_FORWARD1(fun,x) MI_FORWARD(fun)
#define MI_FORWARD2(fun,x,y) MI_FORWARD(fun)
#define MI_FORWARD0(fun,x) MI_FORWARD(fun)
#else
// use forwarding by calling our `mi_` function
#define MI_FORWARD1(fun,x) { return fun(x); }
#define MI_FORWARD2(fun,x,y) { return fun(x,y); }
#define MI_FORWARD0(fun,x) { fun(x); }
#endif
#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE)
// use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
// See: <https://books.google.com/books?id=K8vUkpOXhN4C&pg=PA73>
struct mi_interpose_s {
const void* replacement;
const void* target;
};
#define MI_INTERPOSEX(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
#define MI_INTERPOSE_MI(fun) MI_INTERPOSEX(fun,mi_##fun)
__attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) =
{
MI_INTERPOSE_MI(malloc),
MI_INTERPOSE_MI(calloc),
MI_INTERPOSE_MI(realloc),
MI_INTERPOSE_MI(free)
};
#else
// On all other systems forward to our API
void* malloc(size_t size) mi_attr_noexcept MI_FORWARD1(mi_malloc, size)
void* calloc(size_t size, size_t n) mi_attr_noexcept MI_FORWARD2(mi_calloc, size, n)
void* realloc(void* p, size_t newsize) mi_attr_noexcept MI_FORWARD2(mi_realloc, p, newsize)
void free(void* p) mi_attr_noexcept MI_FORWARD0(mi_free, p)
#endif
// ------------------------------------------------------
// Override new/delete
// This is not really necessary as they usually call
// malloc/free anyway, but it improves performance.
// ------------------------------------------------------
#ifdef __cplusplus
// ------------------------------------------------------
// With a C++ compiler we override the new/delete operators.
// see <https://en.cppreference.com/w/cpp/memory/new/operator_new>
// ------------------------------------------------------
#include <new>
void operator delete(void* p) noexcept MI_FORWARD0(mi_free,p)
void operator delete[](void* p) noexcept MI_FORWARD0(mi_free,p)
void* operator new(std::size_t n) noexcept(false) MI_FORWARD1(mi_malloc,n)
void* operator new[](std::size_t n) noexcept(false) MI_FORWARD1(mi_malloc,n)
#if (__cplusplus >= 201703L)
void* operator new( std::size_t n, std::align_val_t align) noexcept(false) MI_FORWARD2(mi_malloc_aligned,n,align)
void* operator new[]( std::size_t n, std::align_val_t align) noexcept(false) MI_FORWARD2(mi_malloc_aligned,n,align)
#endif
#else
// ------------------------------------------------------
// With a C compiler we override the new/delete operators
// by defining the mangled C++ names of the operators (as
// used by GCC and CLang).
// See <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling>
// ------------------------------------------------------
void _ZdlPv(void* p) MI_FORWARD0(mi_free,p) // delete
void _ZdaPv(void* p) MI_FORWARD0(mi_free,p) // delete[]
#if (MI_INTPTR_SIZE==8)
void* _Znwm(uint64_t n) MI_FORWARD1(mi_malloc,n) // new 64-bit
void* _Znam(uint64_t n) MI_FORWARD1(mi_malloc,n) // new[] 64-bit
void* _Znwmm(uint64_t n, uint64_t align) { return mi_malloc_aligned(n,align); } // aligned new 64-bit
void* _Znamm(uint64_t n, uint64_t align) { return mi_malloc_aligned(n,align); } // aligned new[] 64-bit
#elif (MI_INTPTR_SIZE==4)
void* _Znwj(uint32_t n) MI_FORWARD1(mi_malloc,n) // new 32-bit
void* _Znaj(uint32_t n) MI_FORWARD1(mi_malloc,n) // new[] 32-bit
void* _Znwjj(uint32_t n, uint32_t align) { return mi_malloc_aligned(n,align); } // aligned new 32-bit
void* _Znajj(uint32_t n, uint32_t align) { return mi_malloc_aligned(n,align); } // aligned new[] 32-bit
#else
#error "define overloads for new/delete for this platform (just for performance, can be skipped)"
#endif
#endif // __cplusplus
#ifdef __cplusplus
extern "C" {
#endif
// ------------------------------------------------------
// Posix & Unix functions definitions
// ------------------------------------------------------
#include <errno.h>
#ifndef EINVAL
#define EINVAL 22
#endif
#ifndef ENOMEM
#define ENOMEM 12
#endif
void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize)
size_t malloc_size(void* p) MI_FORWARD1(mi_usable_size,p)
size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p)
void cfree(void* p) MI_FORWARD0(mi_free, p)
int posix_memalign(void** p, size_t alignment, size_t size) {
if (alignment % sizeof(void*) != 0) { *p = NULL; return EINVAL; };
*p = mi_malloc_aligned(size, alignment);
return (*p == NULL ? ENOMEM : 0);
}
void* memalign(size_t alignment, size_t size) {
return mi_malloc_aligned(size, alignment);
}
void* valloc(size_t size) {
return mi_malloc_aligned(size, _mi_os_page_size());
}
void* pvalloc(size_t size) {
size_t psize = _mi_os_page_size();
if (size >= SIZE_MAX - psize) return NULL; // overflow
size_t asize = ((size + psize - 1) / psize) * psize;
return mi_malloc_aligned(asize, psize);
}
void* aligned_alloc(size_t alignment, size_t size) {
return mi_malloc_aligned(size, alignment);
}
void* reallocarray( void* p, size_t count, size_t size ) { // BSD
void* newp = mi_reallocn(p,count,size);
if (newp==NULL) errno = ENOMEM;
return newp;
}
#if defined(__GLIBC__) && defined(__linux__)
// forward __libc interface (needed for redhat linux)
void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size)
void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size)
void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size)
void __libc_free(void* p) MI_FORWARD0(mi_free,p)
void __libc_cfree(void* p) MI_FORWARD0(mi_free,p)
void* __libc_memalign(size_t alignment, size_t size) {
return memalign(alignment,size);
}
void* __libc_valloc(size_t size) {
return valloc(size);
}
void* __libc_pvalloc(size_t size) {
return pvalloc(size);
}
int __posix_memalign(void** p, size_t alignment, size_t size) {
return posix_memalign(p,alignment,size);
}
#endif
#ifdef __cplusplus
}
#endif
#endif // MI_MALLOC_OVERRIDE & !_WIN32
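
The `MI_FORWARD` macros in this file rely on the GCC/Clang `alias` attribute, which makes the standard entry point the very same symbol as its `mi_` counterpart instead of a wrapper call. A stand-alone sketch of that technique with illustrative names (`impl_add`/`add` are not part of this commit; works on ELF targets, which is why macOS uses the interpose table above instead):

#include <stdio.h>

int impl_add(int x, int y) { return x + y; }
// `add` is emitted as an alias of `impl_add` -- the same trick MI_FORWARD
// uses to turn `malloc` into an alias of `mi_malloc`.
int add(int x, int y) __attribute__((alias("impl_add"), used));

int main(void) {
  printf("%d\n", add(2, 3));        // prints 5
  printf("%d\n", add == impl_add);  // prints 1: both names resolve to one symbol
  return 0;
}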

445
src/alloc.c Normal file

@@ -0,0 +1,445 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"
#include <string.h> // memset
#define MI_IN_ALLOC_C
#include "alloc-override.c"
#undef MI_IN_ALLOC_C
// ------------------------------------------------------
// Allocation
// ------------------------------------------------------
// Fast allocation in a page: just pop from the free list.
// Fall back to generic allocation only if the list is empty.
extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept {
mi_assert_internal(page->block_size==0||page->block_size >= size);
mi_block_t* block = page->free;
if (mi_unlikely(block == NULL)) {
return _mi_malloc_generic(heap, size); // slow path
}
mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
// pop from the free list
page->free = mi_block_next(page,block);
page->used++;
mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
#if (MI_DEBUG)
memset(block, MI_DEBUG_UNINIT, size);
#elif (MI_SECURE)
block->next = 0;
#endif
#if (MI_STAT>1)
if(size <= MI_LARGE_SIZE_MAX) mi_heap_stat_increase(heap,normal[_mi_bin(size)], 1);
#endif
return block;
}
// allocate a small block
extern inline void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
mi_assert(size <= MI_SMALL_SIZE_MAX);
mi_page_t* page = _mi_heap_get_free_small_page(heap,size);
return _mi_page_malloc(heap, page, size);
}
extern inline void* mi_malloc_small(size_t size) mi_attr_noexcept {
return mi_heap_malloc_small(mi_get_default_heap(), size);
}
// zero initialized small block
void* mi_zalloc_small(size_t size) mi_attr_noexcept {
void* p = mi_malloc_small(size);
if (p != NULL) { memset(p, 0, size); }
return p;
}
// The main allocation function
extern inline void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
mi_assert(heap!=NULL);
mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
void* p;
if (mi_likely(size <= MI_SMALL_SIZE_MAX)) {
p = mi_heap_malloc_small(heap, size);
}
else {
p = _mi_malloc_generic(heap, size);
}
#if MI_STAT>1
if (p != NULL) {
if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
mi_heap_stat_increase( heap, malloc, mi_good_size(size) ); // overestimate for aligned sizes
}
#endif
return p;
}
extern inline void* mi_malloc(size_t size) mi_attr_noexcept {
return mi_heap_malloc(mi_get_default_heap(), size);
}
void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) {
void* p = mi_heap_malloc(heap,size);
if (zero && p != NULL) memset(p,0,size);
return p;
}
extern inline void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
return _mi_heap_malloc_zero(heap, size, true);
}
void* mi_zalloc(size_t size) mi_attr_noexcept {
return mi_heap_zalloc(mi_get_default_heap(),size);
}
// ------------------------------------------------------
// Free
// ------------------------------------------------------
// multi-threaded free
static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block)
{
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
bool use_delayed;
do {
tfreex = tfree = page->thread_free;
use_delayed = (tfree.delayed == MI_USE_DELAYED_FREE);
if (mi_unlikely(use_delayed)) {
// unlikely: this only happens on the first concurrent free in a page that is in the full list
tfreex.delayed = MI_DELAYED_FREEING;
}
else {
// usual: directly add to page thread_free list
mi_block_set_next(page, block, (mi_block_t*)((uintptr_t)tfree.head << MI_TF_PTR_SHIFT));
tfreex.head = (uintptr_t)block >> MI_TF_PTR_SHIFT;
}
} while (!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex.value, tfree.value));
if (mi_likely(!use_delayed)) {
// increment the thread free count and return
mi_atomic_increment(&page->thread_freed);
}
else {
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
mi_heap_t* heap = page->heap;
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree;
do {
dfree = (mi_block_t*)heap->thread_delayed_free;
mi_block_set_nextx(heap->cookie,block,dfree);
} while (!mi_atomic_compare_exchange_ptr((volatile void**)&heap->thread_delayed_free, block, dfree));
}
// and reset the MI_DELAYED_FREEING flag
do {
tfreex = tfree = page->thread_free;
tfreex.delayed = MI_NO_DELAYED_FREE;
} while (!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex.value, tfree.value));
}
}
// regular free
static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block)
{
#if (MI_DEBUG)
memset(block, MI_DEBUG_FREED, page->block_size);
#endif
// and push it on the free list
if (mi_likely(local)) {
// owning thread can free a block directly
mi_block_set_next(page, block, page->local_free);
page->local_free = block;
page->used--;
if (mi_unlikely(mi_page_all_free(page))) {
_mi_page_retire(page);
}
else if (mi_unlikely(page->flags.in_full)) {
_mi_page_unfull(page);
}
}
else {
_mi_free_block_mt(page,block);
}
}
// Adjust a block that was allocated aligned, to the actual start of the block in the page.
mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, void* p) {
mi_assert_internal(page!=NULL && p!=NULL);
size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL);
size_t adjust = (diff % page->block_size);
return (mi_block_t*)((uintptr_t)p - adjust);
}
static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool local, void* p) {
mi_block_t* block = (page->flags.has_aligned ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
_mi_free_block(page, local, block);
}
// Free a block
void mi_free(void* p) mi_attr_noexcept
{
// optimize: merge null check with the segment masking (below)
//if (p == NULL) return;
#if (MI_DEBUG>0)
if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) {
_mi_error_message("trying to free an invalid (unaligned) pointer: %p\n", p);
return;
}
#endif
const mi_segment_t* const segment = _mi_ptr_segment(p);
if (segment == NULL) return; // checks for (p==NULL)
bool local = (_mi_thread_id() == segment->thread_id); // preload, note: putting the thread_id in the page->flags does not improve performance
#if (MI_DEBUG>0)
if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) {
_mi_error_message("trying to mi_free a pointer that does not point to a valid heap space: %p\n", p);
return;
}
#endif
mi_page_t* page = _mi_segment_page_of(segment, p);
#if (MI_STAT>1)
mi_heap_t* heap = mi_heap_get_default();
mi_heap_stat_decrease( heap, malloc, mi_usable_size(p));
if (page->block_size <= MI_LARGE_SIZE_MAX) {
mi_heap_stat_decrease( heap, normal[_mi_bin(page->block_size)], 1);
}
// huge page stat is accounted for in `_mi_page_retire`
#endif
// adjust if it might be an un-aligned block
if (mi_likely(page->flags.value==0)) { // note: merging both tests (local | value) does not matter for performance
mi_block_t* block = (mi_block_t*)p;
if (mi_likely(local)) {
// owning thread can free a block directly
mi_block_set_next(page, block, page->local_free); // note: moving this write earlier does not matter for performance
page->local_free = block;
page->used--;
if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); }
}
else {
// use atomic operations for a multi-threaded free
_mi_free_block_mt(page, block);
}
}
else {
// aligned blocks, or a full page; use the more generic path
mi_free_generic(segment, page, local, p);
}
}
void _mi_free_delayed_block(mi_block_t* block) {
mi_assert_internal(block != NULL);
const mi_segment_t* segment = _mi_ptr_segment(block);
mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(_mi_thread_id() == segment->thread_id);
mi_page_t* page = _mi_segment_page_of(segment,block);
_mi_free_block(page,true,block);
}
// Bytes available in a block
size_t mi_usable_size(void* p) mi_attr_noexcept {
if (p==NULL) return 0;
const mi_segment_t* segment = _mi_ptr_segment(p);
const mi_page_t* page = _mi_segment_page_of(segment,p);
size_t size = page->block_size;
if (mi_unlikely(page->flags.has_aligned)) {
ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p);
mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
return (size - adjust);
}
else {
return size;
}
}
// ------------------------------------------------------
// ensure explicit external inline definitions are emitted!
// ------------------------------------------------------
#ifdef __cplusplus
void* _mi_externs[] = {
(void*)&_mi_page_malloc,
(void*)&mi_malloc_small,
(void*)&mi_malloc,
};
#endif
// ------------------------------------------------------
// Allocation extensions
// ------------------------------------------------------
extern inline void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_heap_zalloc(heap,total);
}
void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
return mi_heap_calloc(mi_get_default_heap(),count,size);
}
// Uninitialized `calloc`
extern void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_heap_malloc(heap, total);
}
void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
return mi_heap_mallocn(mi_get_default_heap(),count,size);
}
// Expand in place or fail
void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
if (p == NULL) return NULL;
size_t size = mi_usable_size(p);
if (newsize > size) return NULL;
return p; // it fits
}
void* _mi_realloc_zero(void* p, size_t newsize, bool zero) {
if (p == NULL) return _mi_heap_malloc_zero(mi_get_default_heap(),newsize,zero);
size_t size = mi_usable_size(p);
if (newsize <= size && newsize >= (size / 2)) {
return p; // reallocation still fits and not more than 50% waste
}
void* newp = mi_malloc(newsize); // maybe in another heap
if (mi_likely(newp != NULL)) {
if (zero && newsize > size) {
// also set last word in the previous allocation to zero to ensure any padding is zero-initialized
size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
memset((uint8_t*)newp + start, 0, newsize - start);
}
memcpy(newp, p, (newsize > size ? size : newsize));
mi_free(p); // only free if successful
}
return newp;
}
void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
return _mi_realloc_zero(p,newsize,false);
}
// Zero initialized reallocation
void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
return _mi_realloc_zero(p,newsize,true);
}
void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_rezalloc(p,total);
}
void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
return mi_realloc(p,total);
}
// Reallocate but free `p` on errors
void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
void* newp = mi_realloc(p,newsize);
if (newp==NULL && p!=NULL) mi_free(p);
return newp;
}
// `strdup` using mi_malloc
char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
if (s == NULL) return NULL;
size_t n = strlen(s);
char* t = (char*)mi_heap_malloc(heap,n+1);
if (t != NULL) memcpy(t, s, n + 1);
return t;
}
char* mi_strdup(const char* s) mi_attr_noexcept {
return mi_heap_strdup(mi_get_default_heap(), s);
}
// `strndup` using mi_malloc
char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept {
if (s == NULL) return NULL;
size_t m = strlen(s);
if (n > m) n = m;
char* t = (char*)mi_heap_malloc(heap, n+1);
if (t == NULL) return NULL;
memcpy(t, s, n);
t[n] = 0;
return t;
}
char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
return mi_heap_strndup(mi_get_default_heap(),s,n);
}
// `realpath` using mi_malloc
#ifdef _WIN32
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <windows.h>
char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
// todo: use GetFullPathNameW to allow longer file names
char buf[PATH_MAX];
DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? buf : resolved_name), NULL);
if (res == 0) {
errno = GetLastError(); return NULL;
}
else if (res > PATH_MAX) {
errno = EINVAL; return NULL;
}
else if (resolved_name != NULL) {
return resolved_name;
}
else {
return mi_heap_strndup(heap, buf, PATH_MAX);
}
}
#else
#include <limits.h>
#ifndef PATH_MAX
#define PATH_MAX 260
#endif
char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
if (resolved_name != NULL) {
return realpath(fname,resolved_name);
}
else {
char buf[PATH_MAX+1];
char* rname = realpath(fname,buf);
return mi_heap_strndup(heap,rname,PATH_MAX); // ok if `rname==NULL`
}
}
#endif
char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name);
}
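
A usage sketch for `mi_expand` and `mi_usable_size` defined above (illustrative only; the exact usable size depends on the size class the allocation falls into):

#include "mimalloc.h"
#include <assert.h>
#include <stddef.h>

int main(void) {
  void* p = mi_malloc(100);
  size_t avail = mi_usable_size(p);        // >= 100, rounded up to the block size
  assert(mi_expand(p, avail) == p);        // still fits: the same pointer is returned
  assert(mi_expand(p, avail + 1) == NULL); // does not fit: NULL, and `p` stays valid
  mi_free(p);
  return 0;
}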

506
src/heap.c Normal file

@@ -0,0 +1,506 @@
/*----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"
#include <string.h> // memset, memcpy
/* -----------------------------------------------------------
Helpers
----------------------------------------------------------- */
// return `true` if ok, `false` to break
typedef bool (heap_page_visitor_fun)(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2);
// Visit all pages in a heap; returns `false` if break was called.
static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void* arg1, void* arg2)
{
if (heap==NULL || heap->page_count==0) return 0;
// visit all pages
#if MI_DEBUG>1
size_t total = heap->page_count;
#endif
size_t count = 0;
for (size_t i = 0; i <= MI_BIN_FULL; i++) {
mi_page_queue_t* pq = &heap->pages[i];
mi_page_t* page = pq->first;
while(page != NULL) {
mi_page_t* next = page->next; // save next in case the page gets removed from the queue
mi_assert_internal(page->heap == heap);
count++;
if (!fn(heap, pq, page, arg1, arg2)) return false;
page = next; // and continue
}
}
mi_assert_internal(count == total);
return true;
}
#if MI_DEBUG>1
static bool _mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
UNUSED(arg1);
UNUSED(arg2);
UNUSED(pq);
mi_assert_internal(page->heap == heap);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(segment->thread_id == heap->thread_id);
mi_assert_expensive(_mi_page_is_valid(page));
return true;
}
static bool mi_heap_is_valid(mi_heap_t* heap) {
mi_assert_internal(heap!=NULL);
mi_heap_visit_pages(heap, &_mi_heap_page_is_valid, NULL, NULL);
return true;
}
#endif
/* -----------------------------------------------------------
"Collect" pages by migrating `local_free` and `thread_free`
lists and freeing empty pages. This is done when a thread
stops (and in that case abandons pages if there are still
blocks alive)
----------------------------------------------------------- */
typedef enum mi_collect_e {
NORMAL,
FORCE,
ABANDON
} mi_collect_t;
static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) {
UNUSED(arg2);
UNUSED(heap);
mi_collect_t collect = (mi_collect_t)arg_collect;
_mi_page_free_collect(page);
if (mi_page_all_free(page)) {
// no more used blocks, free the page. TODO: should we retire here and be less aggressive?
_mi_page_free(page, pq, collect != NORMAL);
}
else if (collect == ABANDON) {
// still used blocks but the thread is done; abandon the page
_mi_page_abandon(page, pq);
}
return true; // don't break
}
static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
{
_mi_deferred_free(heap,collect > NORMAL);
if (!mi_heap_is_initialized(heap)) return;
// collect (some) abandoned pages
if (collect >= NORMAL && !heap->no_reclaim) {
if (collect == NORMAL) {
// this may free some segments (but also take ownership of abandoned pages)
_mi_segment_try_reclaim_abandoned(heap, false, &heap->tld->segments);
}
#if MI_DEBUG
else if (collect == ABANDON && _mi_is_main_thread() && mi_heap_is_backing(heap)) {
// the main thread is abandoned, try to free all abandoned segments.
// if all memory is freed by now, all segments should be freed.
_mi_segment_try_reclaim_abandoned(heap, true, &heap->tld->segments);
}
#endif
}
// if abandoning, mark all full pages to no longer add to delayed_free
if (collect == ABANDON) {
for (mi_page_t* page = heap->pages[MI_BIN_FULL].first; page != NULL; page = page->next) {
_mi_page_use_delayed_free(page, false); // set thread_free.delayed to MI_NO_DELAYED_FREE
}
}
// free thread delayed blocks.
// (if abandoning, after this there are no more local references into the pages.)
_mi_heap_delayed_free(heap);
// collect all pages owned by this thread
mi_heap_visit_pages(heap, &mi_heap_page_collect, (void*)(collect), NULL);
mi_assert_internal( collect != ABANDON || heap->thread_delayed_free == NULL );
// collect segment caches
if (collect >= FORCE) {
_mi_segment_thread_collect(&heap->tld->segments);
}
}
void _mi_heap_collect_abandon(mi_heap_t* heap) {
mi_heap_collect_ex(heap, ABANDON);
}
void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept {
mi_heap_collect_ex(heap, (force ? FORCE : NORMAL));
}
void mi_collect(bool force) mi_attr_noexcept {
mi_heap_collect(mi_get_default_heap(), force);
}
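/* Illustrative usage sketch (editor's addition): a long-running program can
   periodically migrate free lists and release empty pages, or force a full
   collection (including the segment caches) at a quiet point:

     mi_collect(false);   // NORMAL collection
     mi_collect(true);    // FORCE collection, also collects the segment caches
*/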
/* -----------------------------------------------------------
Heap new
----------------------------------------------------------- */
mi_heap_t* mi_heap_get_default() {
mi_thread_init();
return mi_get_default_heap();
}
mi_heap_t* mi_heap_get_backing() {
mi_heap_t* heap = mi_heap_get_default();
mi_assert_internal(heap!=NULL);
mi_heap_t* bheap = heap->tld->heap_backing;
mi_assert_internal(bheap!=NULL);
mi_assert_internal(bheap->thread_id == _mi_thread_id());
return bheap;
}
uintptr_t _mi_heap_random(mi_heap_t* heap) {
uintptr_t r = heap->random;
heap->random = _mi_random_shuffle(r);
return r;
}
mi_heap_t* mi_heap_new() {
mi_heap_t* bheap = mi_heap_get_backing();
mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);
if (heap==NULL) return NULL;
memcpy(heap, &_mi_heap_empty, sizeof(mi_heap_t));
heap->tld = bheap->tld;
heap->thread_id = _mi_thread_id();
heap->cookie = ((uintptr_t)heap ^ _mi_heap_random(bheap)) | 1;
heap->random = _mi_heap_random(bheap);
  heap->no_reclaim = true;  // don't reclaim abandoned pages, as otherwise `mi_heap_destroy` would be unsafe
return heap;
}
// zero out the page queues
static void mi_heap_reset_pages(mi_heap_t* heap) {
mi_assert_internal(mi_heap_is_initialized(heap));
// TODO: copy full empty heap instead?
memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
#ifdef MI_MEDIUM_DIRECT
memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium));
#endif
memcpy(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
heap->thread_delayed_free = NULL;
heap->page_count = 0;
}
// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
static void mi_heap_free(mi_heap_t* heap) {
mi_assert_internal(mi_heap_is_initialized(heap));
  if (mi_heap_is_backing(heap)) return; // don't free the backing heap
// reset default
if (mi_heap_is_default(heap)) {
_mi_heap_default = heap->tld->heap_backing;
}
// and free the used memory
mi_free(heap);
}
/* -----------------------------------------------------------
Heap destroy
----------------------------------------------------------- */
static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
UNUSED(arg1);
UNUSED(arg2);
UNUSED(heap);
UNUSED(pq);
// ensure no more thread_delayed_free will be added
_mi_page_use_delayed_free(page, false);
// stats
if (page->block_size > MI_LARGE_SIZE_MAX) {
mi_heap_stat_decrease(heap,huge,page->block_size);
}
#if (MI_STAT>1)
size_t inuse = page->used - page->thread_freed;
if (page->block_size <= MI_LARGE_SIZE_MAX) {
mi_heap_stat_decrease(heap,normal[_mi_bin(page->block_size)], inuse);
}
mi_heap_stat_decrease(heap,malloc, page->block_size * inuse); // todo: off for aligned blocks...
#endif
// pretend it is all free now
mi_assert_internal(page->thread_freed<=0xFFFF);
page->used = (uint16_t)page->thread_freed;
// and free the page
_mi_segment_page_free(page,false /* no force? */, &heap->tld->segments);
return true; // keep going
}
void _mi_heap_destroy_pages(mi_heap_t* heap) {
mi_heap_visit_pages(heap, &_mi_heap_page_destroy, NULL, NULL);
mi_heap_reset_pages(heap);
}
void mi_heap_destroy(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
mi_assert(heap->no_reclaim);
mi_assert_expensive(mi_heap_is_valid(heap));
if (!mi_heap_is_initialized(heap)) return;
if (!heap->no_reclaim) {
// don't free in case it may contain reclaimed pages
mi_heap_delete(heap);
}
else {
// free all pages
_mi_heap_destroy_pages(heap);
mi_heap_free(heap);
}
}
/* -----------------------------------------------------------
Safe Heap delete
----------------------------------------------------------- */
// Transfer the pages from one heap to the other
static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
mi_assert_internal(heap!=NULL);
if (from==NULL || from->page_count == 0) return;
  // unfull all full pages of the `from` heap so they end up in its regular queues
  mi_page_t* page = from->pages[MI_BIN_FULL].first;
  while (page != NULL) {
    mi_page_t* next = page->next;
    _mi_page_unfull(page);
    page = next;
  }
  mi_assert_internal(from->pages[MI_BIN_FULL].first == NULL);
// free outstanding thread delayed free blocks
_mi_heap_delayed_free(from);
// transfer all pages by appending the queues; this will set
// a new heap field which is ok as all pages are unfull'd and thus
// other threads won't access this field anymore (see `mi_free_block_mt`)
for (size_t i = 0; i < MI_BIN_FULL; i++) {
mi_page_queue_t* pq = &heap->pages[i];
mi_page_queue_t* append = &from->pages[i];
_mi_page_queue_append(heap, pq, append);
}
mi_assert_internal(from->thread_delayed_free == NULL);
// and reset the `from` heap
mi_heap_reset_pages(from);
}
// Safe delete a heap without freeing any still allocated blocks in that heap.
void mi_heap_delete(mi_heap_t* heap)
{
mi_assert(mi_heap_is_initialized(heap));
mi_assert_expensive(mi_heap_is_valid(heap));
if (!mi_heap_is_initialized(heap)) return;
if (!mi_heap_is_backing(heap)) {
    // transfer still-used pages to the backing heap
mi_heap_absorb(heap->tld->heap_backing, heap);
}
else {
// the backing heap abandons its pages
_mi_heap_collect_abandon(heap);
}
mi_assert_internal(heap->page_count==0);
mi_heap_free(heap);
}
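/* Illustrative sketch of the two tear-down paths (editor's addition).
   `mi_heap_destroy` frees every block in the heap at once and is only safe
   for heaps created with `mi_heap_new` (which set `no_reclaim`), while
   `mi_heap_delete` keeps allocated blocks alive by migrating the pages to
   the backing heap:

     mi_heap_t* h = mi_heap_new();
     void* p = mi_heap_malloc(h, 128);
     mi_heap_destroy(h);              // frees `p` and everything else in `h`

     mi_heap_t* h2 = mi_heap_new();
     void* q = mi_heap_malloc(h2, 128);
     mi_heap_delete(h2);              // `q` stays valid
     mi_free(q);
*/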
mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
if (!mi_heap_is_initialized(heap)) return NULL;
mi_assert_expensive(mi_heap_is_valid(heap));
mi_heap_t* old = _mi_heap_default;
_mi_heap_default = heap;
return old;
}
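/* Illustrative sketch (editor's addition): temporarily route a thread's
   default allocations through a private heap and restore the previous
   default afterwards:

     mi_heap_t* h   = mi_heap_new();
     mi_heap_t* old = mi_heap_set_default(h);
     void* p = mi_malloc(64);         // allocated in `h`
     mi_heap_set_default(old);
     mi_free(p);
     mi_heap_delete(h);
*/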
/* -----------------------------------------------------------
Analysis
----------------------------------------------------------- */
// static since it is not thread safe to access heaps from other threads.
static mi_heap_t* mi_heap_of_block(const void* p) {
if (p == NULL) return NULL;
mi_segment_t* segment = _mi_ptr_segment(p);
bool valid = (_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(valid);
if (mi_unlikely(!valid)) return NULL;
return _mi_segment_page_of(segment,p)->heap;
}
bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
mi_assert(heap != NULL);
if (!mi_heap_is_initialized(heap)) return false;
return (heap == mi_heap_of_block(p));
}
static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) {
UNUSED(heap);
UNUSED(pq);
bool* found = (bool*)vfound;
mi_segment_t* segment = _mi_page_segment(page);
void* start = _mi_page_start(segment, page, NULL);
void* end = (uint8_t*)start + (page->capacity * page->block_size);
*found = (p >= start && p < end);
return (!*found); // continue if not found
}
bool mi_heap_check_owned(mi_heap_t* heap, const void* p) {
mi_assert(heap != NULL);
if (!mi_heap_is_initialized(heap)) return false;
if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false; // only aligned pointers
bool found = false;
mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found);
return found;
}
bool mi_check_owned(const void* p) {
return mi_heap_check_owned(mi_get_default_heap(), p);
}
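/* Illustrative sketch (editor's addition): these queries are meant for
   debugging and assertions, for example to verify that a pointer handed to
   a subsystem was really allocated from that subsystem's heap:

     mi_heap_t* h = mi_heap_new();
     void* p = mi_heap_malloc(h, 32);
     bool owned = mi_heap_contains_block(h, p) && mi_heap_check_owned(h, p);  // expected: true
*/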
/* -----------------------------------------------------------
Visit all heap blocks and areas
Todo: enable visiting abandoned pages, and
enable visiting all blocks of all heaps across threads
----------------------------------------------------------- */
// Separate struct to keep `mi_page_t` out of the public interface
typedef struct mi_heap_area_ex_s {
mi_heap_area_t area;
mi_page_t* page;
} mi_heap_area_ex_t;
static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) {
mi_assert(xarea != NULL);
if (xarea==NULL) return true;
const mi_heap_area_t* area = &xarea->area;
mi_page_t* page = xarea->page;
mi_assert(page != NULL);
if (page == NULL) return true;
_mi_page_free_collect(page);
mi_assert_internal(page->local_free == NULL);
if (page->used == 0) return true;
size_t psize;
uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize);
if (page->capacity == 1) {
// optimize page with one block
mi_assert_internal(page->used == 1 && page->free == NULL);
return visitor(page->heap, area, pstart, page->block_size, arg);
}
// create a bitmap of free blocks.
#define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*))
uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)];
memset(free_map, 0, sizeof(free_map));
size_t free_count = 0;
for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
free_count++;
mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize));
size_t offset = (uint8_t*)block - pstart;
mi_assert_internal(offset % page->block_size == 0);
size_t blockidx = offset / page->block_size; // Todo: avoid division?
mi_assert_internal( blockidx < MI_MAX_BLOCKS);
size_t bitidx = (blockidx / sizeof(uintptr_t));
size_t bit = blockidx - (bitidx * sizeof(uintptr_t));
free_map[bitidx] |= ((uintptr_t)1 << bit);
}
mi_assert_internal(page->capacity == (free_count + page->used));
// walk through all blocks skipping the free ones
size_t used_count = 0;
for (size_t i = 0; i < page->capacity; i++) {
size_t bitidx = (i / sizeof(uintptr_t));
size_t bit = i - (bitidx * sizeof(uintptr_t));
uintptr_t m = free_map[bitidx];
if (bit == 0 && m == UINTPTR_MAX) {
i += (sizeof(uintptr_t) - 1); // skip a run of free blocks
}
else if ((m & ((uintptr_t)1 << bit)) == 0) {
used_count++;
uint8_t* block = pstart + (i * page->block_size);
if (!visitor(page->heap, area, block, page->block_size, arg)) return false;
}
}
mi_assert_internal(page->used == used_count);
return true;
}
typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg);
static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) {
UNUSED(heap);
UNUSED(pq);
mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun;
mi_heap_area_ex_t xarea;
xarea.page = page;
xarea.area.reserved = page->reserved * page->block_size;
xarea.area.committed = page->capacity * page->block_size;
xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL);
xarea.area.used = page->used - page->thread_freed; // race is ok
xarea.area.block_size = page->block_size;
return fun(heap, &xarea, arg);
}
// Visit all heap pages as areas
static bool mi_heap_visit_areas(const mi_heap_t* heap, mi_heap_area_visit_fun* visitor, void* arg) {
if (visitor == NULL) return false;
return mi_heap_visit_pages((mi_heap_t*)heap, &mi_heap_visit_areas_page, visitor, arg);
}
// Just to pass arguments
typedef struct mi_visit_blocks_args_s {
bool visit_blocks;
mi_block_visit_fun* visitor;
void* arg;
} mi_visit_blocks_args_t;
static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* xarea, void* arg) {
mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg;
  if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false;
if (args->visit_blocks) {
return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg);
}
else {
return true;
}
}
// Visit all blocks in a heap
bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
mi_visit_blocks_args_t args = { visit_blocks, visitor, arg };
return mi_heap_visit_areas(heap, &mi_heap_area_visitor, &args);
}
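/* Illustrative sketch (editor's addition): a block visitor that tallies the
   live bytes in a heap. Note that the visitor is also called once per area
   with `block == NULL` (see `mi_heap_area_visitor` above), so a block-level
   visitor should skip those entries:

     static bool count_bytes(const mi_heap_t* heap, const mi_heap_area_t* area,
                             void* block, size_t block_size, void* arg) {
       UNUSED(heap); UNUSED(area);
       if (block != NULL) *(size_t*)arg += block_size;
       return true;  // keep visiting
     }
     // usage:  size_t total = 0;  mi_heap_visit_blocks(heap, true, &count_bytes, &total);
*/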

434
src/init.c Normal file
View file

@ -0,0 +1,434 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include <string.h> // memcpy
// Empty page used to initialize the small free pages array
const mi_page_t _mi_page_empty = {
0, false, false, {0},
0, 0,
NULL, 0, 0, // free, used, cookie
NULL, 0, {0},
0, NULL, NULL, NULL
#if (MI_INTPTR_SIZE==4)
, { NULL }
#endif
};
#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
#define MI_SMALL_PAGES_EMPTY \
{ MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
// Empty page queues for every bin
#define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) }
#define MI_PAGE_QUEUES_EMPTY \
{ QNULL(1), \
QNULL(1), QNULL(2), QNULL(3), QNULL(4), QNULL(5), QNULL(6), QNULL(7), QNULL(8), \
QNULL(10), QNULL(12), QNULL(14), QNULL(16), QNULL(20), QNULL(24), QNULL(28), QNULL(32), \
QNULL(40), QNULL(48), QNULL(56), QNULL(64), QNULL(80), QNULL(96), QNULL(112), QNULL(128), \
QNULL(160), QNULL(192), QNULL(224), QNULL(256), QNULL(320), QNULL(384), QNULL(448), QNULL(512), \
QNULL(640), QNULL(768), QNULL(896), QNULL(1024), QNULL(1280), QNULL(1536), QNULL(1792), QNULL(2048), \
QNULL(2560), QNULL(3072), QNULL(3584), QNULL(4096), QNULL(5120), QNULL(6144), QNULL(7168), QNULL(8192), \
QNULL(10240), QNULL(12288), QNULL(14336), QNULL(16384), QNULL(20480), QNULL(24576), QNULL(28672), QNULL(32768), \
QNULL(40960), QNULL(49152), QNULL(57344), QNULL(65536), QNULL(81920), QNULL(98304), QNULL(114688), \
QNULL(MI_LARGE_WSIZE_MAX + 1 /*131072, Huge queue */), \
QNULL(MI_LARGE_WSIZE_MAX + 2) /* Full queue */ }
#define MI_STAT_COUNT_NULL() {0,0,0,0}
// Empty statistics
#if MI_STAT>1
#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT64(MI_STAT_COUNT_NULL) }
#else
#define MI_STAT_COUNT_END_NULL()
#endif
#define MI_STATS_NULL \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
{ 0, 0 } \
MI_STAT_COUNT_END_NULL()
// --------------------------------------------------------
// Statically allocate an empty heap as the initial
// thread local value for the default heap,
// and statically allocate the backing heap for the main
// thread so it can function without doing any allocation
// itself (as accessing a thread local for the first time
// may lead to allocation itself on some platforms)
// --------------------------------------------------------
const mi_heap_t _mi_heap_empty = {
NULL,
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY,
NULL,
0,
0,
0,
0,
false
};
mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
#define tld_main_stats ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats)))
static mi_tld_t tld_main = {
0,
&_mi_heap_main,
{ { NULL, NULL }, 0, 0, 0, NULL, tld_main_stats }, // segments
{ 0, NULL, NULL, 0, tld_main_stats }, // os
{ MI_STATS_NULL } // stats
};
mi_heap_t _mi_heap_main = {
&tld_main,
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY,
NULL,
0,
0,
0,
0,
false // can reclaim
};
bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`.
mi_stats_t _mi_stats_main = { MI_STATS_NULL };
/* -----------------------------------------------------------
Initialization of random numbers
----------------------------------------------------------- */
#ifdef _WIN32
#include <windows.h>
#else
#include <time.h>
#endif
uintptr_t _mi_random_shuffle(uintptr_t x) {
#if (MI_INTPTR_SIZE==8)
// by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
x ^= x >> 30;
x *= 0xbf58476d1ce4e5b9UL;
x ^= x >> 27;
x *= 0x94d049bb133111ebUL;
x ^= x >> 31;
#elif (MI_INTPTR_SIZE==4)
// by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
x ^= x >> 16;
x *= 0x7feb352dUL;
x ^= x >> 15;
x *= 0x846ca68bUL;
x ^= x >> 16;
#endif
return x;
}
uintptr_t _mi_random_init(uintptr_t seed /* can be zero */) {
// Hopefully, ASLR makes our function address random
uintptr_t x = (uintptr_t)((void*)&_mi_random_init);
x ^= seed;
// xor with high res time
#ifdef _WIN32
LARGE_INTEGER pcount;
QueryPerformanceCounter(&pcount);
x ^= (uintptr_t)(pcount.QuadPart);
#else
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
x ^= (uintptr_t)time.tv_sec;
x ^= (uintptr_t)time.tv_nsec;
#endif
// and do a few randomization steps
uintptr_t max = ((x ^ (x >> 7)) & 0x0F) + 1;
for (uintptr_t i = 0; i < max; i++) {
x = _mi_random_shuffle(x);
}
return x;
}
uintptr_t _mi_ptr_cookie(const void* p) {
return ((uintptr_t)p ^ _mi_heap_main.cookie);
}
/* -----------------------------------------------------------
Initialization and freeing of the thread local heaps
----------------------------------------------------------- */
typedef struct mi_thread_data_s {
mi_heap_t heap; // must come first due to cast in `_mi_heap_done`
mi_tld_t tld;
} mi_thread_data_t;
// Initialize the thread local default heap, called from `mi_thread_init`
static bool _mi_heap_init() {
if (mi_heap_is_initialized(_mi_heap_default)) return true;
if (_mi_is_main_thread()) {
// the main heap is statically allocated
_mi_heap_default = &_mi_heap_main;
mi_assert_internal(_mi_heap_default->tld->heap_backing == _mi_heap_default);
}
else {
// use `_mi_os_alloc` to allocate directly from the OS
mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t),&_mi_stats_main); // Todo: more efficient allocation?
if (td == NULL) {
_mi_error_message("failed to allocate thread local heap memory\n");
return false;
}
mi_tld_t* tld = &td->tld;
mi_heap_t* heap = &td->heap;
memcpy(heap, &_mi_heap_empty, sizeof(*heap));
heap->thread_id = _mi_thread_id();
heap->random = _mi_random_init(heap->thread_id);
heap->cookie = ((uintptr_t)heap ^ _mi_heap_random(heap)) | 1;
heap->tld = tld;
memset(tld, 0, sizeof(*tld));
tld->heap_backing = heap;
tld->segments.stats = &tld->stats;
tld->os.stats = &tld->stats;
_mi_heap_default = heap;
}
return false;
}
// Free the thread local default heap (called from `mi_thread_done`)
static bool _mi_heap_done() {
mi_heap_t* heap = _mi_heap_default;
if (!mi_heap_is_initialized(heap)) return true;
// reset default heap
_mi_heap_default = (_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
// todo: delete all non-backing heaps?
// switch to backing heap and free it
heap = heap->tld->heap_backing;
if (!mi_heap_is_initialized(heap)) return false;
_mi_stats_done(&heap->tld->stats);
// free if not the main thread (or in debug mode)
if (heap != &_mi_heap_main) {
if (heap->page_count > 0) {
_mi_heap_collect_abandon(heap);
}
_mi_os_free(heap, sizeof(mi_thread_data_t), &_mi_stats_main);
}
else if (MI_DEBUG > 0) {
_mi_heap_destroy_pages(heap);
mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main);
}
return false;
}
// --------------------------------------------------------
// Try to run `mi_thread_done()` automatically so any memory
// owned by the thread but not yet released can be abandoned
// and re-owned by another thread.
//
// 1. windows dynamic library:
// call from DllMain on DLL_THREAD_DETACH
// 2. windows static library:
// use `FlsAlloc` to call a destructor when the thread is done
// 3. unix, pthreads:
// use a pthread key to call a destructor when a pthread is done
//
// In the last two cases we also need to call `mi_process_init`
// to set up the thread local keys.
// --------------------------------------------------------
#ifndef _WIN32
#define MI_USE_PTHREADS
#endif
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// nothing to do as it is done in DllMain
#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
// use thread local storage keys to detect thread ending
#include <windows.h>
static DWORD mi_fls_key;
static void NTAPI mi_fls_done(PVOID value) {
if (value!=NULL) mi_thread_done();
}
#elif defined(MI_USE_PTHREADS)
  // use pthread local storage keys to detect thread ending
#include <pthread.h>
static pthread_key_t mi_pthread_key;
static void mi_pthread_done(void* value) {
if (value!=NULL) mi_thread_done();
}
#else
#pragma message("define a way to call mi_thread_done when a thread is done")
#endif
// Set up handlers so `mi_thread_done` is called automatically
static void mi_process_setup_auto_thread_done() {
static bool tls_initialized = false; // fine if it races
if (tls_initialized) return;
tls_initialized = true;
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// nothing to do as it is done in DllMain
#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
mi_fls_key = FlsAlloc(&mi_fls_done);
#elif defined(MI_USE_PTHREADS)
pthread_key_create(&mi_pthread_key, &mi_pthread_done);
#endif
}
bool _mi_is_main_thread() {
return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id());
}
// This is called from the `mi_malloc_generic`
void mi_thread_init() mi_attr_noexcept
{
// ensure our process has started already
mi_process_init();
// initialize the thread local default heap
if (_mi_heap_init()) return; // returns true if already initialized
// don't further initialize for the main thread
if (_mi_is_main_thread()) return;
mi_stat_increase(mi_get_default_heap()->tld->stats.threads, 1);
// set hooks so our mi_thread_done() will be called
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// nothing to do as it is done in DllMain
#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
FlsSetValue(mi_fls_key, (void*)(_mi_thread_id()|1)); // set to a dummy value so that `mi_fls_done` is called
#elif defined(MI_USE_PTHREADS)
pthread_setspecific(mi_pthread_key, (void*)(_mi_thread_id()|1)); // set to a dummy value so that `mi_pthread_done` is called
#endif
_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
}
void mi_thread_done() mi_attr_noexcept {
// stats
mi_heap_t* heap = mi_get_default_heap();
if (!_mi_is_main_thread() && mi_heap_is_initialized(heap)) {
mi_stat_decrease(heap->tld->stats.threads, 1);
}
// abandon the thread local heap
if (_mi_heap_done()) return; // returns true if already ran
if (!_mi_is_main_thread()) {
_mi_verbose_message("thread done: 0x%zx\n", _mi_thread_id());
}
}
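/* Illustrative sketch (editor's addition): on platforms where none of the
   automatic hooks above apply, a worker thread can release its heap
   manually just before it exits:

     static void* worker(void* arg) {
       UNUSED(arg);
       void* p = mi_malloc(100);
       mi_free(p);
       mi_thread_done();   // abandon this thread's heap so it can be reclaimed
       return NULL;
     }
*/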
// --------------------------------------------------------
// Run functions on process init/done, and thread init/done
// --------------------------------------------------------
static void mi_process_done(void);
void mi_process_init() mi_attr_noexcept {
// ensure we are called once
if (_mi_process_is_initialized) return;
_mi_process_is_initialized = true;
_mi_heap_main.thread_id = _mi_thread_id();
_mi_verbose_message("process init: 0x%zx\n", _mi_heap_main.thread_id);
uintptr_t random = _mi_random_init(_mi_heap_main.thread_id);
_mi_heap_main.cookie = (uintptr_t)&_mi_heap_main ^ random;
_mi_heap_main.random = _mi_random_shuffle(random);
#if (MI_DEBUG)
_mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif
atexit(&mi_process_done);
mi_process_setup_auto_thread_done();
mi_stats_reset();
}
static void mi_process_done(void) {
// only shutdown if we were initialized
if (!_mi_process_is_initialized) return;
// ensure we are called once
static bool process_done = false;
if (process_done) return;
process_done = true;
#ifndef NDEBUG
mi_collect(true);
#endif
if (mi_option_is_enabled(mi_option_show_stats) ||
mi_option_is_enabled(mi_option_verbose)) {
mi_stats_print(NULL);
}
_mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
}
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// Windows DLL: easy to hook into process_init and thread_done
#include <Windows.h>
__declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
if (reason==DLL_PROCESS_ATTACH) {
mi_process_init();
}
else if (reason==DLL_THREAD_DETACH) {
mi_thread_done();
}
return TRUE;
}
#elif defined(__cplusplus)
// C++: use static initialization to detect process start
static bool _mi_process_init() {
mi_process_init();
    return (_mi_heap_main.thread_id != 0);
}
static bool mi_initialized = _mi_process_init();
#elif defined(__GNUC__) || defined(__clang__)
// GCC,Clang: use the constructor attribute
static void __attribute__((constructor)) _mi_process_init() {
mi_process_init();
}
#elif defined(_MSC_VER)
// MSVC: use data section magic for static libraries
// See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
static int _mi_process_init(void) {
mi_process_init();
return 0;
}
typedef int(*_crt_cb)(void);
#ifdef _M_X64
__pragma(comment(linker, "/include:" "_mi_msvc_initu"))
#pragma section(".CRT$XIU", long, read)
#else
__pragma(comment(linker, "/include:" "__mi_msvc_initu"))
#endif
#pragma data_seg(".CRT$XIU")
_crt_cb _mi_msvc_initu[] = { &_mi_process_init };
#pragma data_seg()
#else
#pragma message("define a way to call mi_process_init/done on your platform")
#endif

197
src/options.c Normal file
View file

@ -0,0 +1,197 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include <stdio.h>
#include <string.h> // strcmp
#include <ctype.h> // toupper
#include <stdarg.h>
// --------------------------------------------------------
// Options
// --------------------------------------------------------
typedef enum mi_init_e {
UNINIT, // not yet initialized
DEFAULTED, // not found in the environment, use default value
INITIALIZED // found in environment or set explicitly
} mi_init_t;
typedef struct mi_option_desc_s {
long value; // the value
mi_init_t init; // is it initialized yet? (from the environment)
const char* name; // option name without `mimalloc_` prefix
} mi_option_desc_t;
static mi_option_desc_t options[_mi_option_last] = {
{ 0, UNINIT, "page_reset" },
{ 0, UNINIT, "cache_reset" },
{ 0, UNINIT, "pool_commit" },
#if MI_SECURE
{ MI_SECURE, INITIALIZED, "secure" }, // in secure build the environment setting is ignored
#else
{ 0, UNINIT, "secure" },
#endif
{ 0, UNINIT, "show_stats" },
{ MI_DEBUG, UNINIT, "show_errors" },
{ MI_DEBUG, UNINIT, "verbose" }
};
static void mi_option_init(mi_option_desc_t* desc);
long mi_option_get(mi_option_t option) {
mi_assert(option >= 0 && option < _mi_option_last);
mi_option_desc_t* desc = &options[option];
if (desc->init == UNINIT) {
mi_option_init(desc);
if (option != mi_option_verbose) {
_mi_verbose_message("option '%s': %zd\n", desc->name, desc->value);
}
}
return desc->value;
}
void mi_option_set(mi_option_t option, long value) {
mi_assert(option >= 0 && option < _mi_option_last);
mi_option_desc_t* desc = &options[option];
desc->value = value;
desc->init = INITIALIZED;
}
void mi_option_set_default(mi_option_t option, long value) {
mi_assert(option >= 0 && option < _mi_option_last);
mi_option_desc_t* desc = &options[option];
if (desc->init != INITIALIZED) {
desc->value = value;
}
}
bool mi_option_is_enabled(mi_option_t option) {
return (mi_option_get(option) != 0);
}
void mi_option_enable(mi_option_t option, bool enable) {
mi_option_set(option, (enable ? 1 : 0));
}
void mi_option_enable_default(mi_option_t option, bool enable) {
mi_option_set_default(option, (enable ? 1 : 0));
}
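/* Illustrative sketch (editor's addition): options can be set
   programmatically before they are first read, or come from the environment
   (see `mi_option_init` below):

     mi_option_enable(mi_option_show_stats, true);   // print statistics at process exit
     mi_option_set(mi_option_verbose, 1);            // same effect as MIMALLOC_VERBOSE=1
     if (mi_option_is_enabled(mi_option_show_errors)) {
       // error messages will be printed to stderr
     }
*/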
// --------------------------------------------------------
// Messages
// --------------------------------------------------------
// Define our own limited `fprintf` that avoids memory allocation.
// We do this using `snprintf` with a limited buffer.
static void mi_vfprintf( FILE* out, const char* prefix, const char* fmt, va_list args ) {
char buf[256];
if (fmt==NULL) return;
if (out==NULL) out = stdout;
vsnprintf(buf,sizeof(buf)-1,fmt,args);
if (prefix != NULL) fputs(prefix,out);
fputs(buf,out);
}
void _mi_fprintf( FILE* out, const char* fmt, ... ) {
va_list args;
va_start(args,fmt);
mi_vfprintf(out,NULL,fmt,args);
va_end(args);
}
void _mi_verbose_message(const char* fmt, ...) {
if (!mi_option_is_enabled(mi_option_verbose)) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(stderr, "mimalloc: ", fmt, args);
va_end(args);
}
void _mi_error_message(const char* fmt, ...) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(stderr, "mimalloc: error: ", fmt, args);
va_end(args);
mi_assert(false);
}
void _mi_warning_message(const char* fmt, ...) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(stderr, "mimalloc: warning: ", fmt, args);
va_end(args);
}
#if MI_DEBUG
void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) {
_mi_fprintf(stderr,"mimalloc: assertion failed: at \"%s\":%u, %s\n assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion);
abort();
}
#endif
// --------------------------------------------------------
// Initialize options by checking the environment
// --------------------------------------------------------
static void mi_strlcpy(char* dest, const char* src, size_t dest_size) {
dest[0] = 0;
#pragma warning(suppress:4996)
strncpy(dest, src, dest_size - 1);
dest[dest_size - 1] = 0;
}
static void mi_strlcat(char* dest, const char* src, size_t dest_size) {
#pragma warning(suppress:4996)
strncat(dest, src, dest_size - 1);
dest[dest_size - 1] = 0;
}
static void mi_option_init(mi_option_desc_t* desc) {
desc->init = DEFAULTED;
// Read option value from the environment
char buf[32];
mi_strlcpy(buf, "mimalloc_", sizeof(buf));
mi_strlcat(buf, desc->name, sizeof(buf));
#pragma warning(suppress:4996)
char* s = getenv(buf);
if (s == NULL) {
for (size_t i = 0; i < strlen(buf); i++) {
buf[i] = toupper(buf[i]);
}
#pragma warning(suppress:4996)
s = getenv(buf);
}
if (s != NULL) {
mi_strlcpy(buf, s, sizeof(buf));
for (size_t i = 0; i < strlen(buf); i++) {
buf[i] = toupper(buf[i]);
}
if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) {
desc->value = 1;
desc->init = INITIALIZED;
}
else if (strstr("0;FALSE;NO;OFF", buf) != NULL) {
desc->value = 0;
desc->init = INITIALIZED;
}
else {
char* end = buf;
long value = strtol(buf, &end, 10);
if (*end == 0) {
desc->value = value;
desc->init = INITIALIZED;
}
else {
_mi_warning_message("environment option mimalloc_%s has an invalid value: %s\n", desc->name, buf);
}
}
}
}
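/* Illustrative note (editor's addition): the lookup above first tries the
   lower-case `mimalloc_<name>` variable and then the upper-case variant, so
   for example either of

     mimalloc_show_stats=1 ./myprogram
     MIMALLOC_VERBOSE=1    ./myprogram

   enables the corresponding option; the values TRUE/FALSE, YES/NO, ON/OFF
   and plain numbers (parsed with `strtol`) are accepted as well.
*/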

358
src/os.c Normal file
View file

@ -0,0 +1,358 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE // ensure mmap flags are defined
#endif
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include <string.h> // memset
#include <stdio.h> // debug fprintf
#include <errno.h>
/* -----------------------------------------------------------
Raw allocation on Windows (VirtualAlloc) and Unix's (mmap).
Defines a portable `mmap`, `munmap` and `mmap_trim`.
----------------------------------------------------------- */
#if defined(_WIN32)
#include <windows.h>
#else
#include <sys/mman.h> // mmap
#include <unistd.h> // sysconf
#endif
uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
uintptr_t x = (sz / alignment) * alignment;
if (x < sz) x += alignment;
if (x < sz) return 0; // overflow
return x;
}
static void* mi_align_up_ptr(void* p, size_t alignment) {
return (void*)_mi_align_up((uintptr_t)p, alignment);
}
static uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
return (sz / alignment) * alignment;
}
static void* mi_align_down_ptr(void* p, size_t alignment) {
return (void*)_mi_align_down((uintptr_t)p, alignment);
}
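/* Worked example (editor's addition): with an alignment of 8,
     _mi_align_up(13, 8)  == 16     _mi_align_down(13, 8) == 8
     _mi_align_up(16, 8)  == 16     _mi_align_down(16, 8) == 16
   and `_mi_align_up` returns 0 only when rounding up would overflow. */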
static void* os_pool_alloc(size_t size, size_t alignment, mi_os_tld_t* tld);
// cached OS page size
size_t _mi_os_page_size() {
static size_t page_size = 0;
if (page_size == 0) {
#if defined(_WIN32)
SYSTEM_INFO si;
GetSystemInfo(&si);
page_size = (si.dwPageSize > 0 ? si.dwPageSize : 4096);
#else
long result = sysconf(_SC_PAGESIZE);
page_size = (result > 0 ? (size_t)result : 4096);
#endif
}
return page_size;
}
static void mi_munmap(void* addr, size_t size)
{
if (addr == NULL || size == 0) return;
bool err = false;
#if defined(_WIN32)
err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
#else
err = (munmap(addr, size) == -1);
#endif
if (err) {
#pragma warning(suppress:4996)
_mi_warning_message("munmap failed: %s, addr 0x%8li, size %lu\n", strerror(errno), (size_t)addr, size);
}
}
static void* mi_mmap(void* addr, size_t size, int extra_flags, mi_stats_t* stats) {
UNUSED(stats);
if (size == 0) return NULL;
void* p;
#if defined(_WIN32)
p = VirtualAlloc(addr, size, MEM_RESERVE | MEM_COMMIT | extra_flags, PAGE_READWRITE);
#else
#if !defined(MAP_ANONYMOUS)
#define MAP_ANONYMOUS MAP_ANON
#endif
int flags = MAP_PRIVATE | MAP_ANONYMOUS | extra_flags;
if (addr != NULL) {
#if defined(MAP_EXCL)
flags |= MAP_FIXED | MAP_EXCL; // BSD
#elif defined(MAP_FIXED_NOREPLACE)
flags |= MAP_FIXED_NOREPLACE; // Linux
#elif defined(MAP_FIXED)
flags |= MAP_FIXED;
#endif
}
p = mmap(addr, size, (PROT_READ | PROT_WRITE), flags, -1, 0);
if (p == MAP_FAILED) p = NULL;
if (addr != NULL && p != addr) {
mi_munmap(p, size);
p = NULL;
}
#endif
mi_assert(p == NULL || (addr == NULL && p != addr) || (addr != NULL && p == addr));
if (p != NULL) mi_stat_increase(stats->mmap_calls, 1);
return p;
}
static void* mi_os_page_align_region(void* addr, size_t size, size_t* newsize) {
mi_assert(addr != NULL && size > 0);
if (newsize != NULL) *newsize = 0;
if (size == 0 || addr == NULL) return NULL;
// page align conservatively within the range
void* start = mi_align_up_ptr(addr, _mi_os_page_size());
void* end = mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size());
ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
if (diff <= 0) return NULL;
mi_assert_internal((size_t)diff <= size);
if (newsize != NULL) *newsize = (size_t)diff;
return start;
}
// Signal to the OS that the address range is no longer in use
// but may be used later again. This will release physical memory
// pages and reduce swapping while keeping the memory committed.
// We page align to a conservative area inside the range to reset.
bool _mi_os_reset(void* addr, size_t size) {
// page align conservatively within the range
size_t csize;
void* start = mi_os_page_align_region(addr,size,&csize);
if (csize==0) return true;
#if defined(_WIN32)
void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE);
mi_assert(p == start);
return (p == start);
#else
#if defined(MADV_FREE)
static int advice = MADV_FREE;
int err = madvise(start, csize, advice);
if (err!=0 && errno==EINVAL && advice==MADV_FREE) {
// if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
advice = MADV_DONTNEED;
err = madvise(start, csize, advice);
}
#else
int err = madvise(start, csize, MADV_DONTNEED);
#endif
if (err != 0) {
_mi_warning_message("madvise reset error: start: 0x%8p, csize: 0x%8zux, errno: %i\n", start, csize, errno);
}
//mi_assert(err == 0);
return (err == 0);
#endif
}
// Protect a region in memory to be not accessible.
static bool mi_os_protectx(void* addr, size_t size, bool protect) {
// page align conservatively within the range
size_t csize = 0;
void* start = mi_os_page_align_region(addr, size, &csize);
if (csize==0) return false;
int err = 0;
#ifdef _WIN32
DWORD oldprotect = 0;
BOOL ok = VirtualProtect(start,csize,protect ? PAGE_NOACCESS : PAGE_READWRITE,&oldprotect);
err = (ok ? 0 : -1);
#else
err = mprotect(start,csize,protect ? PROT_NONE : (PROT_READ|PROT_WRITE));
#endif
if (err != 0) {
_mi_warning_message("mprotect error: start: 0x%8p, csize: 0x%8zux, errno: %i\n", start, csize, errno);
}
return (err==0);
}
bool _mi_os_protect(void* addr, size_t size) {
return mi_os_protectx(addr,size,true);
}
bool _mi_os_unprotect(void* addr, size_t size) {
return mi_os_protectx(addr, size, false);
}
/* -----------------------------------------------------------
OS allocation using mmap/munmap
----------------------------------------------------------- */
void* _mi_os_alloc(size_t size, mi_stats_t* stats) {
if (size == 0) return NULL;
void* p = mi_mmap(NULL, size, 0, stats);
mi_assert(p!=NULL);
if (p != NULL) mi_stat_increase(stats->reserved, size);
return p;
}
void _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
UNUSED(stats);
mi_munmap(p, size);
mi_stat_decrease(stats->reserved, size);
}
// Slow but guaranteed way to allocate aligned memory
// by over-allocating and then reallocating at a fixed aligned
// address that should be available then.
static void* mi_os_alloc_aligned_ensured(size_t size, size_t alignment, size_t trie, mi_stats_t* stats)
{
if (trie >= 3) return NULL; // stop recursion (only on Windows)
size_t alloc_size = size + alignment;
mi_assert(alloc_size >= size); // overflow?
if (alloc_size < size) return NULL;
// allocate a chunk that includes the alignment
void* p = mi_mmap(NULL, alloc_size, 0, stats);
if (p == NULL) return NULL;
// create an aligned pointer in the allocated area
void* aligned_p = mi_align_up_ptr(p, alignment);
mi_assert(aligned_p != NULL);
#if defined(_WIN32)
// free it and try to allocate `size` at exactly `aligned_p`
// note: this may fail in case another thread happens to VirtualAlloc
// concurrently at that spot. We try up to 3 times to mitigate this.
mi_munmap(p, alloc_size);
p = mi_mmap(aligned_p, size, 0, stats);
if (p != aligned_p) {
if (p != NULL) mi_munmap(p, size);
    return mi_os_alloc_aligned_ensured(size, alignment, trie+1, stats);  // pass an incremented count so the recursion terminates
}
#else
// we selectively unmap parts around the over-allocated area.
size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
size_t mid_size = _mi_align_up(size, _mi_os_page_size());
size_t post_size = alloc_size - pre_size - mid_size;
if (pre_size > 0) mi_munmap(p, pre_size);
if (post_size > 0) mi_munmap((uint8_t*)aligned_p + mid_size, post_size);
#endif
mi_assert(((uintptr_t)aligned_p) % alignment == 0);
return aligned_p;
}
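/* Worked example (editor's addition) for the non-Windows path above:
   requesting size = 0x10000 (64KiB) at alignment = 0x400000 (4MiB) gives
   alloc_size = 0x410000. If `mmap` happens to return p = 0x7f0000123000 then
     aligned_p = 0x7f0000400000    (p rounded up to the 4MiB boundary)
     pre_size  = 0x2dd000          unmapped again in front of aligned_p
     mid_size  = 0x010000          kept
     post_size = 0x123000          unmapped again behind aligned_p + mid_size
   so exactly the aligned 64KiB range remains mapped. */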
// Allocate an aligned block.
// Since `mi_mmap` is relatively slow we try to allocate directly at first and
// hope to get an aligned address; only when that fails do we fall back
// to a guaranteed method by overallocating at first and adjusting.
// TODO: use VirtualAlloc2 with alignment on Windows 10 / Windows Server 2016.
void* _mi_os_alloc_aligned(size_t size, size_t alignment, mi_os_tld_t* tld)
{
if (size == 0) return NULL;
if (alignment < 1024) return _mi_os_alloc(size, tld->stats);
void* p = os_pool_alloc(size,alignment,tld);
if (p != NULL) return p;
void* suggest = NULL;
#if defined(MAP_ALIGNED)
// on BSD, use the aligned mmap api
size_t n = _mi_bsr(alignment);
if ((size_t)1 << n == alignment && n >= 12) { // alignment is a power of 2 and >= 4096
    p = mi_mmap(suggest, size, MAP_ALIGNED(n), tld->stats); // use the FreeBSD aligned mmap flag
}
#endif
if (p==NULL && (tld->mmap_next_probable % alignment) == 0) {
// if the next probable address is aligned,
// then try to just allocate `size` and hope it is aligned...
p = mi_mmap(suggest, size, 0, tld->stats);
if (p == NULL) return NULL;
if (((uintptr_t)p % alignment) == 0) mi_stat_increase(tld->stats->mmap_right_align, 1);
}
//fprintf(stderr, "segment address guess: %s, p=%lxu, guess:%lxu\n", (p != NULL && (uintptr_t)p % alignment ==0 ? "correct" : "incorrect"), (uintptr_t)p, next_probable);
if (p==NULL || ((uintptr_t)p % alignment) != 0) {
// if `p` is not yet aligned after all, free the block and use a slower
// but guaranteed way to allocate an aligned block
if (p != NULL) mi_munmap(p, size);
mi_stat_increase( tld->stats->mmap_ensure_aligned, 1);
//fprintf(stderr, "mimalloc: slow mmap 0x%lx\n", _mi_thread_id());
p = mi_os_alloc_aligned_ensured(size, alignment,0,tld->stats);
}
if (p != NULL) {
mi_stat_increase( tld->stats->reserved, size);
// next probable address is the page-aligned address just after the newly allocated area.
const size_t alloc_align =
#if defined(_WIN32)
64 * 1024; // Windows allocates 64kb aligned
#else
_mi_os_page_size(); // page size on other OS's
#endif
size_t probable_size = MI_SEGMENT_SIZE;
if (tld->mmap_previous > p) {
// Linux tends to allocate downward
tld->mmap_next_probable = _mi_align_down((uintptr_t)p - probable_size, alloc_align); // ((uintptr_t)previous - (uintptr_t)p);
}
else {
// Otherwise, guess the next address is page aligned `size` from current pointer
tld->mmap_next_probable = _mi_align_up((uintptr_t)p + probable_size, alloc_align);
}
tld->mmap_previous = p;
}
return p;
}
// Pooled allocation: on 64-bit systems with plenty
// of virtual addresses, we allocate 10 segments at a
// time to minimize `mmap` calls and increase aligned
// allocations. This is only good on systems that
// do overcommit so we put it behind the `MIMALLOC_POOL_COMMIT` option.
// For now, we disable it on windows as VirtualFree must
// be called on the original allocation and cannot be called
// for individual fragments.
#if !defined(_WIN32) || (MI_INTPTR_SIZE<8)
static void* os_pool_alloc(size_t size, size_t alignment, mi_os_tld_t* tld) {
UNUSED(size);
UNUSED(alignment);
UNUSED(tld);
return NULL;
}
#else
#define MI_POOL_ALIGNMENT MI_SEGMENT_SIZE
#define MI_POOL_SIZE (10*MI_POOL_ALIGNMENT)
static void* os_pool_alloc(size_t size, size_t alignment, mi_os_tld_t* tld)
{
if (!mi_option_is_enabled(mi_option_pool_commit)) return NULL;
if (alignment != MI_POOL_ALIGNMENT) return NULL;
size = _mi_align_up(size,MI_POOL_ALIGNMENT);
if (size > MI_POOL_SIZE) return NULL;
if (tld->pool_available == 0) {
tld->pool = (uint8_t*)mi_os_alloc_aligned_ensured(MI_POOL_SIZE,MI_POOL_ALIGNMENT,0,tld->stats);
if (tld->pool == NULL) return NULL;
tld->pool_available += MI_POOL_SIZE;
}
if (size > tld->pool_available) return NULL;
void* p = tld->pool;
tld->pool_available -= size;
tld->pool += size;
return p;
}
#endif

352
src/page-queue.c Normal file
View file

@ -0,0 +1,352 @@
/*----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* -----------------------------------------------------------
Definition of page queues for each block size
----------------------------------------------------------- */
#ifndef MI_IN_PAGE_C
#error "this file should be included from 'page.c'"
#endif
/* -----------------------------------------------------------
Minimal alignment in machine words (i.e. `sizeof(void*)`)
----------------------------------------------------------- */
#if (MI_MAX_ALIGN_SIZE > 4*MI_INTPTR_SIZE)
#error "define aligment for more than 4x word size for this platform"
#elif (MI_MAX_ALIGN_SIZE > 2*MI_INTPTR_SIZE)
#define MI_ALIGN4W // 4 machine words minimal alignment
#elif (MI_MAX_ALIGN_SIZE > MI_INTPTR_SIZE)
#define MI_ALIGN2W // 2 machine words minimal alignment
#else
// ok, default alignment is 1 word
#endif
/* -----------------------------------------------------------
Queue query
----------------------------------------------------------- */
static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_LARGE_SIZE_MAX+sizeof(uintptr_t)));
}
static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) {
return (pq->block_size == (MI_LARGE_SIZE_MAX+(2*sizeof(uintptr_t))));
}
static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
return (pq->block_size > MI_LARGE_SIZE_MAX);
}
/* -----------------------------------------------------------
Bins
----------------------------------------------------------- */
// Bit scan reverse: return the index of the highest bit.
static inline uint8_t mi_bsr32(uint32_t x);
#if defined(_MSC_VER)
#include <intrin.h>
static inline uint8_t mi_bsr32(uint32_t x) {
uint32_t idx;
_BitScanReverse(&idx, x);
return idx;
}
#elif defined(__GNUC__) || defined(__clang__)
static inline uint8_t mi_bsr32(uint32_t x) {
return (31 - __builtin_clz(x));
}
#else
static inline uint8_t mi_bsr32(uint32_t x) {
// de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
static const uint8_t debruijn[32] = {
31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12,
30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13,
};
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x++;
return debruijn[(x*0x076be629) >> 27];
}
#endif
// Bit scan reverse: return the index of the highest bit.
uint8_t _mi_bsr(uintptr_t x) {
if (x == 0) return 0;
#if MI_INTPTR_SIZE==8
uint32_t hi = (x >> 32);
return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi));
#elif MI_INTPTR_SIZE==4
return mi_bsr32(x);
#else
# error "define bsr for non-32 or 64-bit platforms"
#endif
}
// Return the bin for a given field size.
// Returns MI_BIN_HUGE if the size is too large.
// We use `wsize` for the size in "machine word sizes",
// i.e. byte size == `wsize*sizeof(void*)`.
inline uint8_t _mi_bin(size_t size) {
size_t wsize = _mi_wsize_from_size(size);
uint8_t bin;
if (wsize <= 1) {
bin = 1;
}
#if defined(MI_ALIGN4W)
else if (wsize <= 4) {
bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
}
#elif defined(MI_ALIGN2W)
else if (wsize <= 8) {
bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
}
#else
else if (wsize <= 8) {
bin = (uint8_t)wsize;
}
#endif
else if (wsize > MI_LARGE_WSIZE_MAX) {
bin = MI_BIN_HUGE;
}
else {
#if defined(MI_ALIGN4W)
if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes
#endif
wsize--;
// find the highest bit
uint8_t b = mi_bsr32((uint32_t)wsize);
// and use the top 3 bits to determine the bin (~16% worst internal fragmentation).
    // - adjust with 3 because we do not round the first 8 sizes
// which each get an exact bin
bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
}
mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE);
return bin;
}
/* -----------------------------------------------------------
Queue of pages with free blocks
----------------------------------------------------------- */
size_t _mi_bin_size(uint8_t bin) {
return _mi_heap_empty.pages[bin].block_size;
}
// Good size for allocation
size_t mi_good_size(size_t size) mi_attr_noexcept {
if (size <= MI_LARGE_SIZE_MAX) {
return _mi_bin_size(_mi_bin(size));
}
else {
return _mi_align_up(size,_mi_os_page_size());
}
}
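/* Worked example (editor's addition, assuming a typical 64-bit build with
   8-byte words and the default MI_ALIGN2W configuration): a request of
   100 bytes is 13 words, so in `_mi_bin`
     wsize-1 = 12,  b = bsr32(12) = 3,
     bin     = (b<<2) + ((12 >> (b-2)) & 0x03) - 3 = 12 + 2 - 3 = 11
   and bin 11 holds 14-word (112 byte) blocks (see the page queue table in
   init.c), so `mi_good_size(100)` returns 112: at most 12 bytes of internal
   fragmentation for this size class. */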
#if (MI_DEBUG>1)
static bool mi_page_queue_contains(mi_page_queue_t* queue, const mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_page_t* list = queue->first;
while (list != NULL) {
mi_assert_internal(list->next == NULL || list->next->prev == list);
mi_assert_internal(list->prev == NULL || list->prev->next == list);
if (list == page) break;
list = list->next;
}
return (list == page);
}
#endif
#if (MI_DEBUG>1)
static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* pq) {
return (pq >= &heap->pages[0] && pq <= &heap->pages[MI_BIN_FULL]);
}
#endif
static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
uint8_t bin = (page->flags.in_full ? MI_BIN_FULL : _mi_bin(page->block_size));
mi_heap_t* heap = page->heap;
mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL);
mi_page_queue_t* pq = &heap->pages[bin];
mi_assert_internal(bin >= MI_BIN_HUGE || page->block_size == pq->block_size);
mi_assert_expensive(mi_page_queue_contains(pq, page));
return pq;
}
static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
uint8_t bin = (page->flags.in_full ? MI_BIN_FULL : _mi_bin(page->block_size));
mi_assert_internal(bin <= MI_BIN_FULL);
mi_page_queue_t* pq = &heap->pages[bin];
mi_assert_internal(page->flags.in_full || page->block_size == pq->block_size);
return pq;
}
// The `pages_free_direct` array is an optimization: for each small size
// (up to 256) it points directly to the page with free blocks of that
// size so the bin does not need to be computed. This means that when the
// first page of a small bin's queue changes, we need to update a
// range of entries in `heap->pages_free_direct`.
static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_queue_t* pq) {
mi_assert_internal(mi_heap_contains_queue(heap,pq));
size_t size = pq->block_size;
if (size > MI_SMALL_SIZE_MAX) return;
mi_page_t* page = pq->first;
if (pq->first == NULL) page = (mi_page_t*)&_mi_page_empty;
// find index in the right direct page array
size_t start;
size_t idx = _mi_wsize_from_size(size);
mi_page_t** pages_free = heap->pages_free_direct;
if (pages_free[idx] == page) return; // already set
// find start slot
if (idx<=1) {
start = 0;
}
else {
    // find the previous size; due to minimal alignment, up to 3 previous bins may need to be skipped
uint8_t bin = _mi_bin(size);
const mi_page_queue_t* prev = pq - 1;
while( bin == _mi_bin(prev->block_size) && prev > &heap->pages[0]) {
prev--;
}
start = 1 + _mi_wsize_from_size(prev->block_size);
if (start > idx) start = idx;
}
// set size range to the right page
mi_assert(start <= idx);
for (size_t sz = start; sz <= idx; sz++) {
pages_free[sz] = page;
}
}
/*
static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
return (queue->first == NULL);
}
*/
static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(mi_page_queue_contains(queue, page));
mi_assert_internal(page->block_size == queue->block_size || (page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_huge(queue)) || (page->flags.in_full && mi_page_queue_is_full(queue)));
if (page->prev != NULL) page->prev->next = page->next;
if (page->next != NULL) page->next->prev = page->prev;
if (page == queue->last) queue->last = page->prev;
if (page == queue->first) {
queue->first = page->next;
// update first
mi_heap_t* heap = page->heap;
mi_assert_internal(mi_heap_contains_queue(heap, queue));
mi_heap_queue_first_update(heap,queue);
}
page->heap->page_count--;
page->next = NULL;
page->prev = NULL;
page->heap = NULL;
page->flags.in_full = false;
}
static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(page->heap == NULL);
mi_assert_internal(!mi_page_queue_contains(queue, page));
mi_assert_internal(page->block_size == queue->block_size || (page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_huge(queue)) || (page->flags.in_full && mi_page_queue_is_full(queue)));
page->flags.in_full = mi_page_queue_is_full(queue);
page->heap = heap;
page->next = queue->first;
page->prev = NULL;
if (queue->first != NULL) {
mi_assert_internal(queue->first->prev == NULL);
queue->first->prev = page;
queue->first = page;
}
else {
queue->first = queue->last = page;
}
// update direct
mi_heap_queue_first_update(heap, queue);
heap->page_count++;
}
static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(mi_page_queue_contains(from, page));
mi_assert_expensive(!mi_page_queue_contains(to, page));
mi_assert_internal(page->block_size == to->block_size ||
(page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_huge(to)) ||
(page->block_size == from->block_size && mi_page_queue_is_full(to)));
if (page->prev != NULL) page->prev->next = page->next;
if (page->next != NULL) page->next->prev = page->prev;
if (page == from->last) from->last = page->prev;
if (page == from->first) {
from->first = page->next;
// update first
mi_heap_t* heap = page->heap;
mi_assert_internal(mi_heap_contains_queue(heap, from));
mi_heap_queue_first_update(heap, from);
}
page->prev = to->last;
page->next = NULL;
if (to->last != NULL) {
mi_assert_internal(page->heap == to->last->heap);
to->last->next = page;
to->last = page;
}
else {
to->first = page;
to->last = page;
mi_heap_queue_first_update(page->heap, to);
}
page->flags.in_full = mi_page_queue_is_full(to);
}
void _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) {
mi_assert_internal(mi_heap_contains_queue(heap,pq));
mi_assert_internal(pq->block_size == append->block_size);
if (append->first==NULL) return;
// set append pages to new heap
for (mi_page_t* page = append->first; page != NULL; page = page->next) {
page->heap = heap;
}
if (pq->last==NULL) {
// take over afresh
mi_assert_internal(pq->first==NULL);
pq->first = append->first;
pq->last = append->last;
mi_heap_queue_first_update(heap, pq);
}
else {
// append to end
mi_assert_internal(pq->last!=NULL);
mi_assert_internal(append->first!=NULL);
pq->last->next = append->first;
append->first->prev = pq->last;
pq->last = append->last;
}
}

710
src/page.c Normal file
View file

@ -0,0 +1,710 @@
/*----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* -----------------------------------------------------------
The core of the allocator. Every segment contains
pages of a certain block size. The main function
exported is `mi_malloc_generic`.
----------------------------------------------------------- */
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"
#include <string.h> // memset, memcpy
/* -----------------------------------------------------------
Definition of page queues for each block size
----------------------------------------------------------- */
#define MI_IN_PAGE_C
#include "page-queue.c"
#undef MI_IN_PAGE_C
/* -----------------------------------------------------------
Page helpers
----------------------------------------------------------- */
// Index a block in a page
static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t i) {
mi_assert_internal(page != NULL);
mi_assert_internal(i <= page->reserved);
return (mi_block_t*)((uint8_t*)page_start + (i * page->block_size));
}
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_stats_t* stats);
#if (MI_DEBUG>1)
static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
size_t count = 0;
while (head != NULL) {
mi_assert_internal(page == _mi_ptr_page(head));
count++;
head = mi_block_next(page, head);
}
return count;
}
// Start of the page available memory
static inline uint8_t* mi_page_area(const mi_page_t* page) {
return _mi_page_start(_mi_page_segment(page), page, NULL);
}
static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
size_t psize;
uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize);
mi_block_t* start = (mi_block_t*)page_area;
mi_block_t* end = (mi_block_t*)(page_area + psize);
while(p != NULL) {
if (p < start || p >= end) return false;
p = mi_block_next(page, p);
}
return true;
}
static bool mi_page_is_valid_init(mi_page_t* page) {
mi_assert_internal(page->block_size > 0);
mi_assert_internal(page->used <= page->capacity);
mi_assert_internal(page->capacity <= page->reserved);
mi_segment_t* segment = _mi_page_segment(page);
uint8_t* start = _mi_page_start(segment,page,NULL);
mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
//mi_assert_internal(start + page->capacity*page->block_size == page->top);
mi_assert_internal(mi_page_list_is_valid(page,page->free));
mi_assert_internal(mi_page_list_is_valid(page,page->local_free));
mi_block_t* tfree = (mi_block_t*)((uintptr_t)page->thread_free.head << MI_TF_PTR_SHIFT);
mi_assert_internal(mi_page_list_is_valid(page, tfree));
size_t tfree_count = mi_page_list_count(page, tfree);
mi_assert_internal(tfree_count <= page->thread_freed + 1);
size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free);
mi_assert_internal(page->used + free_count == page->capacity);
return true;
}
bool _mi_page_is_valid(mi_page_t* page) {
mi_assert_internal(mi_page_is_valid_init(page));
mi_assert_internal(page->cookie != 0);
if (page->heap!=NULL) {
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(segment->thread_id == page->heap->thread_id);
mi_page_queue_t* pq = mi_page_queue_of(page);
mi_assert_internal(mi_page_queue_contains(pq, page));
mi_assert_internal(pq->block_size==page->block_size || page->block_size > MI_LARGE_SIZE_MAX || page->flags.in_full);
mi_assert_internal(mi_heap_contains_queue(page->heap,pq));
}
return true;
}
#endif
void _mi_page_use_delayed_free(mi_page_t* page, bool enable) {
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
do {
tfreex = tfree = page->thread_free;
tfreex.delayed = (enable ? MI_USE_DELAYED_FREE : MI_NO_DELAYED_FREE);
if (mi_unlikely(tfree.delayed == MI_DELAYED_FREEING)) {
mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
continue; // and try again
}
}
while(tfreex.delayed != tfree.delayed && // avoid atomic operation if already equal
!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex.value, tfree.value));
}
/* -----------------------------------------------------------
Page collect the `local_free` and `thread_free` lists
----------------------------------------------------------- */
// Collect the local `thread_free` list using an atomic exchange.
// Note: The exchange must be done atomically as this is used right after
// moving to the full list in `mi_page_to_full` and we need to
// ensure that there was no race where the page became unfull just before the move.
static void mi_page_thread_free_collect(mi_page_t* page)
{
mi_block_t* head;
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
do {
tfreex = tfree = page->thread_free;
head = (mi_block_t*)((uintptr_t)tfree.head << MI_TF_PTR_SHIFT);
tfreex.head = 0;
} while (!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex.value, tfree.value));
// return if the list is empty
if (head == NULL) return;
// find the tail
uint16_t count = 1;
mi_block_t* tail = head;
mi_block_t* next;
while ((next = mi_block_next(page,tail)) != NULL) {
count++;
tail = next;
}
// and prepend to the free list
mi_block_set_next(page,tail, page->free);
page->free = head;
// update counts now
mi_atomic_subtract(&page->thread_freed, count);
page->used -= count;
}
void _mi_page_free_collect(mi_page_t* page) {
mi_assert_internal(page!=NULL);
//if (page->free != NULL) return; // avoid expensive append
// free the local free list
if (page->local_free != NULL) {
if (mi_likely(page->free == NULL)) {
// usual case
page->free = page->local_free;
}
else {
mi_block_t* tail = page->free;
mi_block_t* next;
while ((next = mi_block_next(page, tail)) != NULL) {
tail = next;
}
mi_block_set_next(page, tail, page->local_free);
}
page->local_free = NULL;
}
// and the thread free list
if (page->thread_free.head != 0) { // quick test to avoid an atomic operation
mi_page_thread_free_collect(page);
}
}
/* -----------------------------------------------------------
Page fresh and retire
----------------------------------------------------------- */
// called from segments when reclaiming abandoned pages
void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
mi_assert_expensive(mi_page_is_valid_init(page));
mi_assert_internal(page->heap == NULL);
_mi_page_free_collect(page);
mi_page_queue_t* pq = mi_page_queue(heap, page->block_size);
mi_page_queue_push(heap, pq, page);
mi_assert_expensive(_mi_page_is_valid(page));
}
// allocate a fresh page from a segment
static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size) {
mi_assert_internal(mi_heap_contains_queue(heap, pq));
mi_page_t* page = _mi_segment_page_alloc(block_size, &heap->tld->segments, &heap->tld->os);
if (page == NULL) return NULL;
mi_page_init(heap, page, block_size, &heap->tld->stats);
mi_heap_stat_increase( heap, pages, 1);
mi_page_queue_push(heap, pq, page);
mi_assert_expensive(_mi_page_is_valid(page));
return page;
}
// Get a fresh page to use
static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
mi_assert_internal(mi_heap_contains_queue(heap, pq));
// try to reclaim an abandoned page first
mi_page_t* page = pq->first;
if (!heap->no_reclaim &&
_mi_segment_try_reclaim_abandoned(heap, false, &heap->tld->segments) &&
page != pq->first)
{
// we reclaimed, and we got lucky with a reclaimed page in our queue
page = pq->first;
if (page->free != NULL) return page;
}
// otherwise allocate the page
page = mi_page_fresh_alloc(heap, pq, pq->block_size);
if (page==NULL) return NULL;
mi_assert_internal(pq->block_size==page->block_size);
mi_assert_internal(pq==mi_page_queue(heap,page->block_size));
return page;
}
/* -----------------------------------------------------------
Do any delayed frees
(put there by other threads if they deallocated in a full page)
----------------------------------------------------------- */
void _mi_heap_delayed_free(mi_heap_t* heap) {
// take over the list
mi_block_t* block;
do {
block = (mi_block_t*)heap->thread_delayed_free;
} while (block != NULL && !mi_atomic_compare_exchange_ptr((volatile void**)&heap->thread_delayed_free, NULL, block));
// and free them all
while(block != NULL) {
mi_block_t* next = mi_block_nextx(heap->cookie,block);
// use internal free instead of regular one to keep stats etc correct
_mi_free_delayed_block(block);
block = next;
}
}
/* -----------------------------------------------------------
Unfull, abandon, free and retire
----------------------------------------------------------- */
// Move a page from the full list back to a regular list
void _mi_page_unfull(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(page->flags.in_full);
_mi_page_use_delayed_free(page, false);
if (!page->flags.in_full) return;
mi_heap_t* heap = page->heap;
mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
page->flags.in_full = false; // to get the right queue
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
page->flags.in_full = true;
mi_page_queue_enqueue_from(pq, pqfull, page);
}
static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(!mi_page_immediate_available(page));
mi_assert_internal(!page->flags.in_full);
_mi_page_use_delayed_free(page, true);
if (page->flags.in_full) return;
mi_page_queue_enqueue_from(&page->heap->pages[MI_BIN_FULL], pq, page);
mi_page_thread_free_collect(page); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
}
// Abandon a page with used blocks at the end of a thread.
// Note: only call if it is ensured that no references exist from
// the `page->heap->thread_delayed_free` into this page.
// Currently only called through `mi_heap_collect_ex` which ensures this.
void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(page->heap != NULL);
mi_assert_internal(page->thread_free.delayed == MI_NO_DELAYED_FREE);
#if MI_DEBUG>1
// check there are no references left..
for (mi_block_t* block = (mi_block_t*)page->heap->thread_delayed_free; block != NULL; block = mi_block_nextx(page->heap->cookie,block)) {
mi_assert_internal(_mi_ptr_page(block) != page);
}
#endif
// and then remove from our page list
mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
mi_page_queue_remove(pq, page);
// and abandon it
mi_assert_internal(page->heap == NULL);
_mi_segment_page_abandon(page,segments_tld);
}
// Free a page with no more free blocks
void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(mi_page_all_free(page));
mi_assert_internal(page->thread_free.delayed != MI_DELAYED_FREEING);
page->flags.has_aligned = false;
// account for huge pages here
if (page->block_size > MI_LARGE_SIZE_MAX) {
mi_heap_stat_decrease(page->heap, huge, page->block_size);
}
// remove from the page list
// (no need to do _mi_heap_delayed_free first as all blocks are already free)
mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
mi_page_queue_remove(pq, page);
// and free it
mi_assert_internal(page->heap == NULL);
_mi_segment_page_free(page, force, segments_tld);
}
// Retire a page with no more used blocks
// Important to not retire too quickly though as new
// allocations might be coming.
// Note: called from `mi_free` and benchmarks often
// trigger this due to freeing everything and then
// allocating again, so be careful when changing this.
void _mi_page_retire(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(mi_page_all_free(page));
page->flags.has_aligned = false;
// don't retire too often..
// (or we end up retiring and re-allocating most of the time)
// NOTE: refine this more: we should not retire if this
// is the only page left with free blocks. It is not clear
// how to check this efficiently though... for now we just check
// if its neighbours are almost fully used.
if (mi_likely(page->block_size <= MI_LARGE_SIZE_MAX)) {
if (mi_page_mostly_used(page->prev) && mi_page_mostly_used(page->next)) {
return; // don't retire after all
}
}
_mi_page_free(page, mi_page_queue_of(page), false);
}
/* -----------------------------------------------------------
Initialize the initial free list in a page.
In secure mode we initialize a randomized list by
alternating between slices.
----------------------------------------------------------- */
#define MI_MAX_SLICE_SHIFT (6) // at most 64 slices
#define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT)
#define MI_MIN_SLICES (2)
static void mi_page_free_list_extend( mi_heap_t* heap, mi_page_t* page, size_t extend, mi_stats_t* stats)
{
UNUSED(stats);
void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
size_t bsize = page->block_size;
mi_block_t* start = mi_page_block_at(page, page_area, page->capacity);
if (extend < MI_MIN_SLICES || !mi_option_is_enabled(mi_option_secure)) {
// initialize a sequential free list
mi_block_t* end = mi_page_block_at(page, page_area, page->capacity + extend - 1);
mi_block_t* block = start;
for (size_t i = 0; i < extend; i++) {
mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize);
mi_block_set_next(page,block,next);
block = next;
}
mi_block_set_next(page, end, NULL);
page->free = start;
}
else {
// initialize a randomized free list
// set up `slice_count` slices to alternate between
size_t shift = MI_MAX_SLICE_SHIFT;
while ((extend >> shift) == 0) {
shift--;
}
size_t slice_count = (size_t)1U << shift;
size_t slice_extend = extend / slice_count;
mi_assert_internal(slice_extend >= 1);
mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice
size_t counts[MI_MAX_SLICES]; // available objects in the slice
for (size_t i = 0; i < slice_count; i++) {
blocks[i] = mi_page_block_at(page, page_area, page->capacity + i*slice_extend);
counts[i] = slice_extend;
}
counts[slice_count-1] += (extend % slice_count); // final slice holds the modulus too (todo: distribute evenly?)
// and initialize the free list by randomly threading through them
// set up first element
size_t current = _mi_heap_random(heap) % slice_count;
counts[current]--;
page->free = blocks[current];
// and iterate through the rest
uintptr_t rnd = heap->random;
for (size_t i = 1; i < extend; i++) {
// call random_shuffle only every INTPTR_SIZE rounds
size_t round = i%MI_INTPTR_SIZE;
if (round == 0) rnd = _mi_random_shuffle(rnd);
// select a random next slice index
size_t next = ((rnd >> 8*round) & (slice_count-1));
while (counts[next]==0) { // ensure it still has space
next++;
if (next==slice_count) next = 0;
}
// and link the current block to it
counts[next]--;
mi_block_t* block = blocks[current];
blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block
mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next`
current = next;
}
mi_block_set_next( page, blocks[current], NULL); // end of the list
heap->random = _mi_random_shuffle(rnd);
}
// enable the new free list
page->capacity += (uint16_t)extend;
mi_stat_increase(stats->committed, extend * page->block_size);
}
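// A stand-alone sketch (illustration only, with its own includes; not one of the
// allocator's helpers) of how the slice parameters above are derived for a given
// `extend` count in the randomized (secure) case: at most 64 slices, every slice
// receives at least one block, and the remainder goes to the final slice.
#include <stdio.h>
#include <stddef.h>

#define MAX_SLICE_SHIFT 6   // mirrors MI_MAX_SLICE_SHIFT

static void slice_params(size_t extend) {
  size_t shift = MAX_SLICE_SHIFT;
  while ((extend >> shift) == 0) shift--;     // ensure each slice gets >= 1 block
  size_t slice_count  = (size_t)1 << shift;
  size_t slice_extend = extend / slice_count;
  size_t remainder    = extend % slice_count; // added to the final slice
  printf("extend=%4zu -> %2zu slices of %2zu block(s), last slice +%zu\n",
         extend, slice_count, slice_extend, remainder);
}

int main(void) {
  slice_params(3);    // 2 slices of 1, last slice +1
  slice_params(100);  // 64 slices of 1, last slice +36
  slice_params(640);  // 64 slices of 10
  return 0;
}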
/* -----------------------------------------------------------
Page initialize and extend the capacity
----------------------------------------------------------- */
#define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well.
#if MI_SECURE
#define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many
#else
#define MI_MIN_EXTEND (1)
#endif
// Extend the capacity (up to reserved) by initializing a free list
// We do at most `MI_MAX_EXTEND` to avoid touching too much memory
// Note: we also experimented with "bump" allocation on the first
// allocations but this did not speed up any benchmark (due to an
// extra test in malloc? or cache effects?)
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_stats_t* stats) {
UNUSED(stats);
mi_assert(page->free == NULL);
mi_assert(page->local_free == NULL);
mi_assert_expensive(mi_page_is_valid_init(page));
if (page->free != NULL) return;
if (page->capacity >= page->reserved) return;
size_t page_size;
_mi_page_start(_mi_page_segment(page), page, &page_size);
if (page->is_reset) {
page->is_reset = false;
mi_stat_decrease( stats->reset, page_size);
}
mi_stat_increase( stats->pages_extended, 1);
// calculate the extend count
size_t extend = page->reserved - page->capacity;
size_t max_extend = MI_MAX_EXTEND_SIZE/page->block_size;
if (max_extend < MI_MIN_EXTEND) max_extend = MI_MIN_EXTEND;
if (extend > max_extend) {
// ensure we don't touch memory beyond the page to reduce page commit.
// the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%.
extend = (max_extend==0 ? 1 : max_extend);
}
mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
mi_assert_internal(extend < (1UL<<16));
// and append the new blocks to the free list
mi_page_free_list_extend(heap, page, extend, stats );
mi_assert_expensive(mi_page_is_valid_init(page));
}
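// For intuition, a stand-alone sketch (hypothetical helper, non-secure build assumed)
// of the clamp above: with MI_MAX_EXTEND_SIZE at 4KiB, small blocks are added a few
// hundred at a time while large blocks are added one by one.
#include <stdio.h>
#include <stddef.h>

#define MAX_EXTEND_SIZE (4*1024)  // mirrors MI_MAX_EXTEND_SIZE
#define MIN_EXTEND      (1)       // mirrors MI_MIN_EXTEND when MI_SECURE is off

static size_t max_extend_for(size_t block_size) {
  size_t max_extend = MAX_EXTEND_SIZE / block_size;
  return (max_extend < MIN_EXTEND ? MIN_EXTEND : max_extend);
}

int main(void) {
  printf("   8 B blocks -> extend by at most %zu blocks\n", max_extend_for(8));      // 512
  printf(" 1 KiB blocks -> extend by at most %zu blocks\n", max_extend_for(1024));   // 4
  printf(" 8 KiB blocks -> extend by at most %zu blocks\n", max_extend_for(8*1024)); // 1
  return 0;
}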
// Initialize a fresh page
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_stats_t* stats) {
mi_assert(page != NULL);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert(segment != NULL);
// set fields
size_t page_size;
_mi_segment_page_start(segment, page, &page_size);
page->block_size = block_size;
mi_assert_internal(block_size>0);
mi_assert_internal(page_size / block_size < (1L<<16));
page->reserved = (uint16_t)(page_size / block_size);
page->cookie = _mi_heap_random(heap) | 1;
mi_assert_internal(page->capacity == 0);
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->used == 0);
mi_assert_internal(page->thread_free.value == 0);
mi_assert_internal(page->thread_freed == 0);
mi_assert_internal(page->next == NULL);
mi_assert_internal(page->prev == NULL);
mi_assert_internal(page->flags.has_aligned == false);
mi_assert_internal(page->cookie != 0);
mi_assert_expensive(mi_page_is_valid_init(page));
// initialize an initial free list
mi_page_extend_free(heap,page,stats);
mi_assert(mi_page_immediate_available(page));
}
/* -----------------------------------------------------------
Find pages with free blocks
-------------------------------------------------------------*/
// Find a page with free blocks of `page->block_size`.
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq)
{
// search through the pages in "next fit" order
mi_page_t* rpage = NULL;
size_t count = 0;
size_t page_free_count = 0;
mi_page_t* page = pq->first;
while( page != NULL)
{
mi_page_t* next = page->next; // remember next
count++;
// 0. collect freed blocks by us and other threads
_mi_page_free_collect(page);
// 1. if the page contains free blocks, we are done
if (mi_page_immediate_available(page)) {
// If all blocks are free, we might retire this page instead.
// do this at most 8 times to bound allocation time.
// (note: this can happen if a page was earlier not retired due
// to having neighbours that were mostly full or due to concurrent frees)
if (page_free_count < 8 && mi_page_all_free(page)) {
page_free_count++;
if (rpage != NULL) _mi_page_free(rpage,pq,false);
rpage = page;
page = next;
continue; // and keep looking
}
else {
break; // pick this one
}
}
// 2. Try to extend
if (page->capacity < page->reserved) {
mi_page_extend_free(heap, page, &heap->tld->stats);
mi_assert_internal(mi_page_immediate_available(page));
break;
}
// 3. If the page is completely full, move it to the `mi_pages_full`
// queue so we don't visit long-lived pages too often.
mi_assert_internal(!page->flags.in_full && !mi_page_immediate_available(page));
mi_page_to_full(page,pq);
page = next;
} // for each page
mi_stat_counter_increase(heap->tld->stats.searches,count);
if (page == NULL) {
page = rpage;
rpage = NULL;
}
if (rpage != NULL) {
_mi_page_free(rpage,pq,false);
}
if (page == NULL) {
page = mi_page_fresh(heap, pq);
}
else {
mi_assert(pq->first == page);
}
mi_assert_internal(mi_page_immediate_available(page));
return page;
}
// Find a page with free blocks of `size`.
static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
_mi_heap_delayed_free(heap);
mi_page_queue_t* pq = mi_page_queue(heap,size);
mi_page_t* page = pq->first;
if (page != NULL) {
if (mi_option_get(mi_option_secure) >= 3 && page->capacity < page->reserved && ((_mi_heap_random(heap) & 1) == 1)) {
// in secure mode, we extend half the time to increase randomness
mi_page_extend_free(heap, page, &heap->tld->stats);
mi_assert_internal(mi_page_immediate_available(page));
}
else {
_mi_page_free_collect(page);
}
if (mi_page_immediate_available(page)) {
return page; // fast path
}
}
return mi_page_queue_find_free_ex(heap, pq);
}
/* -----------------------------------------------------------
Users can register a deferred free function called
when the `free` list is empty. Since the `local_free`
list is separate, this is deterministically called after
a certain number of allocations.
----------------------------------------------------------- */
static mi_deferred_free_fun* deferred_free = NULL;
void _mi_deferred_free(mi_heap_t* heap, bool force) {
heap->tld->heartbeat++;
if (deferred_free != NULL) {
deferred_free(force, heap->tld->heartbeat);
}
}
void mi_register_deferred_free(mi_deferred_free_fun* fn) mi_attr_noexcept {
deferred_free = fn;
}
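// A hypothetical usage sketch (stand-alone program) of the hook above. The callback
// signature `(bool force, unsigned long long heartbeat)` is assumed from the call
// site in `_mi_deferred_free`; see mimalloc.h for the authoritative typedef.
#include <mimalloc.h>
#include <stdbool.h>
#include <stdio.h>

static void my_deferred_free(bool force, unsigned long long heartbeat) {
  // invoked when a page's `free` list runs empty; `heartbeat` increases per call,
  // so expensive application-level cleanup can be amortized across calls.
  if (force || (heartbeat % 1024) == 0) {
    fprintf(stderr, "deferred free: force=%d heartbeat=%llu\n", (int)force, heartbeat);
  }
}

int main(void) {
  mi_register_deferred_free(&my_deferred_free);
  void* p = mi_malloc(42);
  mi_free(p);
  return 0;
}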
/* -----------------------------------------------------------
General allocation
----------------------------------------------------------- */
// A huge page is allocated directly without being in a queue
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) {
size_t block_size = _mi_wsize_from_size(size) * sizeof(uintptr_t);
mi_assert_internal(_mi_bin(block_size) == MI_BIN_HUGE);
mi_page_queue_t* pq = mi_page_queue(heap,block_size);
mi_assert_internal(mi_page_queue_is_huge(pq));
mi_page_t* page = mi_page_fresh_alloc(heap,pq,block_size);
if (page != NULL) {
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(page->block_size == block_size);
mi_heap_stat_increase( heap, huge, block_size);
}
return page;
}
// Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
{
mi_assert_internal(heap != NULL);
// initialize if necessary
if (mi_unlikely(!mi_heap_is_initialized(heap))) {
mi_thread_init(); // calls `_mi_heap_init` in turn
heap = mi_get_default_heap();
}
mi_assert_internal(mi_heap_is_initialized(heap));
// call potential deferred free routines
_mi_deferred_free(heap, false);
// huge allocation?
mi_page_t* page;
if (mi_unlikely(size > MI_LARGE_SIZE_MAX)) {
page = mi_huge_page_alloc(heap,size);
}
else {
// otherwise find a page with free blocks in our size segregated queues
page = mi_find_free_page(heap,size);
}
if (page == NULL) return NULL; // out of memory
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(page->block_size >= size);
// and try again, this time succeeding! (i.e. this should never recurse)
return _mi_page_malloc(heap, page, size);
}

647
src/segment.c Normal file

@ -0,0 +1,647 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"
#include <string.h> // memset
#include <stdio.h>
/* -----------------------------------------------------------
Segment allocation
We allocate pages inside big OS allocated "segments"
(2mb on 64-bit). This is to avoid splitting VMA's on Linux
and reduce fragmentation on other OS's. Each thread
owns its own segments.
Currently we have:
- small pages (64kb), 32 in one segment
- large pages (2mb), 1 in one segment
- huge blocks > MI_LARGE_SIZE_MAX (256kb) are directly allocated by the OS
It might be good to have "medium" pages too (of, say, 256kb)
to reduce pressure on the virtual address space on 32-bit systems
but for now we choose the simpler implementation since this
will only be a problem if multiple threads allocate many
differently sized objects between 8kb and 2mb which is not common.
In any case the memory for a segment is virtual and only
committed on demand (i.e. we are careful to not touch the memory
until we actually allocate a block there)
If a thread ends, it "abandons" pages with used blocks
and there is an abandoned segment list whose segments can
be reclaimed by still running threads, much like work-stealing.
----------------------------------------------------------- */
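// A stand-alone sketch (illustration only; the real helpers such as _mi_ptr_segment
// live in mimalloc-internal.h) of the addressing trick the layout above enables:
// since every segment is allocated 2mb-aligned (see mi_segment_alloc below), any
// interior pointer can be mapped back to its segment by masking, and to a page
// index by shifting the offset by the segment's page shift.
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define SEGMENT_SIZE ((uintptr_t)1 << 21)   // 2mb, mirrors MI_SEGMENT_SIZE on 64-bit

static void* segment_of(const void* p) {
  return (void*)((uintptr_t)p & ~(SEGMENT_SIZE - 1));   // round down to segment start
}

static size_t page_index_of(const void* p, size_t page_shift) {
  uintptr_t offset = (uintptr_t)p - (uintptr_t)segment_of(p);
  return (size_t)(offset >> page_shift);   // page_shift is 16 for 64kb small pages
}

int main(void) {
  char* base = (char*)((uintptr_t)1 << 30);   // pretend 2mb-aligned segment start
  char* p    = base + (3u << 16) + 100;       // 100 bytes into small page 3
  printf("segment=%p page=%zu\n", segment_of(p), page_index_of(p, 16));
  return 0;
}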
#if (MI_DEBUG > 1)
static bool mi_segment_is_valid(mi_segment_t* segment) {
mi_assert_internal(segment != NULL);
mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(segment->used <= segment->capacity);
mi_assert_internal(segment->abandoned <= segment->used);
size_t nfree = 0;
for (size_t i = 0; i < segment->capacity; i++) {
if (!segment->pages[i].segment_in_use) nfree++;
}
mi_assert_internal(nfree + segment->used == segment->capacity);
mi_assert_internal(segment->thread_id == _mi_thread_id()); // or 0
return true;
}
#endif
/* -----------------------------------------------------------
Queue of segments containing free pages
----------------------------------------------------------- */
#if (MI_DEBUG>1)
static bool mi_segment_queue_contains(const mi_segment_queue_t* queue, mi_segment_t* segment) {
mi_assert_internal(segment != NULL);
mi_segment_t* list = queue->first;
while (list != NULL) {
if (list == segment) break;
mi_assert_internal(list->next==NULL || list->next->prev == list);
mi_assert_internal(list->prev==NULL || list->prev->next == list);
list = list->next;
}
return (list == segment);
}
#endif
// quick test to see if a segment is in the free pages queue
static bool mi_segment_is_in_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) {
bool in_queue = (segment->next != NULL || segment->prev != NULL || tld->small_free.first == segment);
if (in_queue) {
mi_assert(segment->page_kind == MI_PAGE_SMALL); // for now we only support small pages
mi_assert_expensive(mi_segment_queue_contains(&tld->small_free, segment));
}
return in_queue;
}
static bool mi_segment_queue_is_empty(const mi_segment_queue_t* queue) {
return (queue->first == NULL);
}
static void mi_segment_queue_remove(mi_segment_queue_t* queue, mi_segment_t* segment) {
mi_assert_expensive(mi_segment_queue_contains(queue, segment));
if (segment->prev != NULL) segment->prev->next = segment->next;
if (segment->next != NULL) segment->next->prev = segment->prev;
if (segment == queue->first) queue->first = segment->next;
if (segment == queue->last) queue->last = segment->prev;
segment->next = NULL;
segment->prev = NULL;
}
static void mi_segment_enqueue(mi_segment_queue_t* queue, mi_segment_t* segment) {
mi_assert_expensive(!mi_segment_queue_contains(queue, segment));
segment->next = NULL;
segment->prev = queue->last;
if (queue->last != NULL) {
mi_assert_internal(queue->last->next == NULL);
queue->last->next = segment;
queue->last = segment;
}
else {
queue->last = queue->first = segment;
}
}
// Start of the page available memory
uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size)
{
size_t psize = (segment->page_kind == MI_PAGE_HUGE ? segment->segment_size : (size_t)1 << segment->page_shift);
uint8_t* p = (uint8_t*)segment + page->segment_idx*psize;
if (page->segment_idx == 0) {
// the first page starts after the segment info (and possible guard page)
p += segment->segment_info_size;
psize -= segment->segment_info_size;
}
long secure = mi_option_get(mi_option_secure);
if (secure > 1 || (secure == 1 && page->segment_idx == segment->capacity - 1)) {
// secure == 1: the last page has an os guard page at the end
// secure > 1: every page has an os guard page
psize -= _mi_os_page_size();
}
if (page_size != NULL) *page_size = psize;
mi_assert_internal(_mi_ptr_page(p) == page);
mi_assert_internal(_mi_ptr_segment(p) == segment);
return p;
}
static size_t mi_segment_size(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) {
/*
if (mi_option_is_enabled(mi_option_secure)) {
// always reserve maximally so the protection falls on
// the same address area, as we need to reuse them from the caches interchangably.
capacity = MI_SMALL_PAGES_PER_SEGMENT;
}
*/
size_t minsize = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */;
size_t guardsize = 0;
size_t isize = 0;
if (!mi_option_is_enabled(mi_option_secure)) {
// normally no guard pages
isize = _mi_align_up(minsize, (16 > MI_MAX_ALIGN_SIZE ? 16 : MI_MAX_ALIGN_SIZE));
}
else {
// in secure mode, we set up a protected page in between the segment info
// and the page data (and one at the end of the segment)
size_t page_size = _mi_os_page_size();
isize = _mi_align_up(minsize, page_size);
guardsize = page_size;
required = _mi_align_up(required, page_size);
}
if (info_size != NULL) *info_size = isize;
if (pre_size != NULL) *pre_size = isize + guardsize;
return (required==0 ? MI_SEGMENT_SIZE : required + isize + 2*guardsize);
}
/* -----------------------------------------------------------
Segment caches
We keep a small segment cache per thread to avoid repeated allocation
and free in the OS if a program allocates memory and then frees
all again repeatedly. (We tried a one-element cache but that
proved to be too small for certain workloads).
----------------------------------------------------------- */
static void mi_segments_count_add(long inc, mi_segments_tld_t* tld) {
if (inc>=0) mi_stat_increase(tld->stats->segments,inc);
else mi_stat_decrease(tld->stats->segments,-inc);
mi_assert_internal(inc < 0 ? tld->count >= (size_t)(-inc) : tld->count < (SIZE_MAX - inc));
mi_assert_internal(tld->peak >= tld->count);
tld->count += inc;
if (tld->count > tld->peak) tld->peak = tld->count;
}
static size_t mi_segments_peak(mi_segments_tld_t* tld) {
return tld->peak;
}
static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) {
mi_segments_count_add(-1,tld);
_mi_os_free(segment, segment_size,tld->stats);
}
// The segment cache is limited to be at most 1/6th of the peak
// number of segments in use (and no more than 16)
#define MI_SEGMENT_CACHE_MAX (16)
#define MI_SEGMENT_CACHE_FRACTION (6)
static mi_segment_t* mi_segment_cache_pop(mi_segments_tld_t* tld) {
mi_segment_t* segment = tld->cache;
if (segment == NULL) return NULL;
tld->cache_count--;
tld->cache = segment->next;
segment->next = NULL;
return segment;
}
static bool mi_segment_cache_full(mi_segments_tld_t* tld) {
if (tld->cache_count < MI_SEGMENT_CACHE_MAX &&
tld->cache_count*MI_SEGMENT_CACHE_FRACTION < mi_segments_peak(tld)) return false;
// take the opportunity to reduce the segment cache if it is too large (now)
while (tld->cache_count*MI_SEGMENT_CACHE_FRACTION >= mi_segments_peak(tld) + 1) {
mi_segment_t* segment = mi_segment_cache_pop(tld);
mi_assert_internal(segment != NULL);
if (segment != NULL) mi_segment_os_free(segment, MI_SEGMENT_SIZE, tld);
}
return true;
}
static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) {
mi_assert_internal(!mi_segment_is_in_free_queue(segment,tld));
mi_assert_internal(segment->next==NULL);
if (mi_segment_cache_full(tld)) return false;
mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE);
if (mi_option_is_enabled(mi_option_cache_reset) && !mi_option_is_enabled(mi_option_page_reset)) {
_mi_os_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size);
}
segment->next = tld->cache;
tld->cache = segment;
tld->cache_count++;
return true;
}
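// A stand-alone numeric sketch (illustration only) of the acceptance rule in
// mi_segment_cache_full above: a segment is cached only while the cache holds fewer
// than MI_SEGMENT_CACHE_MAX segments and fewer than about peak/MI_SEGMENT_CACHE_FRACTION.
#include <stdio.h>
#include <stddef.h>

#define CACHE_MAX      16   // mirrors MI_SEGMENT_CACHE_MAX
#define CACHE_FRACTION 6    // mirrors MI_SEGMENT_CACHE_FRACTION

static size_t cache_limit(size_t peak) {
  size_t count = 0;
  while (count < CACHE_MAX && count * CACHE_FRACTION < peak) count++;
  return count;   // further pushes are rejected once this count is reached
}

int main(void) {
  printf("peak=10   -> cache holds at most %2zu segments\n", cache_limit(10));   // 2
  printf("peak=60   -> cache holds at most %2zu segments\n", cache_limit(60));   // 10
  printf("peak=1000 -> cache holds at most %2zu segments\n", cache_limit(1000)); // 16
  return 0;
}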
// called by ending threads to free cached segments
void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
mi_segment_t* segment;
while ((segment = mi_segment_cache_pop(tld)) != NULL) {
mi_segment_os_free(segment, MI_SEGMENT_SIZE, tld);
}
mi_assert_internal(tld->cache_count == 0);
mi_assert_internal(tld->cache == NULL);
}
/* -----------------------------------------------------------
Segment allocation
----------------------------------------------------------- */
// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` .
static mi_segment_t* mi_segment_alloc( size_t required, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
{
// calculate needed sizes first
size_t capacity;
if (page_kind == MI_PAGE_HUGE) {
mi_assert_internal(page_shift==MI_SEGMENT_SHIFT && required > 0);
capacity = 1;
}
else {
mi_assert_internal(required==0);
size_t page_size = (size_t)1 << page_shift;
capacity = MI_SEGMENT_SIZE / page_size;
mi_assert_internal(MI_SEGMENT_SIZE % page_size == 0);
mi_assert_internal(capacity >= 1 && capacity <= MI_SMALL_PAGES_PER_SEGMENT);
}
size_t info_size;
size_t pre_size;
size_t segment_size = mi_segment_size( capacity, required, &pre_size, &info_size);
size_t page_size = (page_kind == MI_PAGE_HUGE ? segment_size : (size_t)1 << page_shift);
// Allocate the segment
mi_segment_t* segment = NULL;
// try to get it from our caches
if (segment_size == MI_SEGMENT_SIZE) {
segment = mi_segment_cache_pop(tld);
if (segment != NULL && mi_option_is_enabled(mi_option_secure) && segment->page_kind != page_kind) {
_mi_os_unprotect(segment,segment->segment_size);
}
}
// and otherwise allocate it from the OS
if (segment == NULL) {
segment = (mi_segment_t*)_mi_os_alloc_aligned(segment_size, MI_SEGMENT_SIZE, os_tld);
if (segment == NULL) return NULL;
mi_segments_count_add(1,tld);
}
mi_assert_internal((uintptr_t)segment % MI_SEGMENT_SIZE == 0);
memset(segment, 0, info_size);
if (mi_option_is_enabled(mi_option_secure)) {
// in secure mode, we set up a protected page in between the segment info
// and the page data
mi_assert_internal( info_size == pre_size - _mi_os_page_size() && info_size % _mi_os_page_size() == 0);
_mi_os_protect( (uint8_t*)segment + info_size, (pre_size - info_size) );
size_t os_page_size = _mi_os_page_size();
if (mi_option_get(mi_option_secure) <= 1) {
// and protect the last page too
_mi_os_protect( (uint8_t*)segment + segment_size - os_page_size, os_page_size );
}
else {
// protect every page
for (size_t i = 0; i < capacity; i++) {
_mi_os_protect( (uint8_t*)segment + (i+1)*page_size - os_page_size, os_page_size );
}
}
}
segment->page_kind = page_kind;
segment->capacity = capacity;
segment->page_shift = page_shift;
segment->segment_size = segment_size;
segment->segment_info_size = pre_size;
segment->thread_id = _mi_thread_id();
segment->cookie = _mi_ptr_cookie(segment);
for (uint8_t i = 0; i < segment->capacity; i++) {
segment->pages[i].segment_idx = i;
}
mi_stat_increase(tld->stats->committed, segment->segment_info_size);
//fprintf(stderr,"mimalloc: alloc segment at %p\n", (void*)segment);
return segment;
}
#if MI_STAT
// Available memory in a page
static size_t mi_page_size(const mi_page_t* page) {
size_t psize;
_mi_segment_page_start(_mi_page_segment(page), page, &psize);
return psize;
}
#endif
static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) {
//fprintf(stderr,"mimalloc: free segment at %p\n", (void*)segment);
mi_assert(segment != NULL);
if (mi_segment_is_in_free_queue(segment,tld)) {
if (segment->page_kind != MI_PAGE_SMALL) {
fprintf(stderr, "mimalloc: expecting small segment: %i, %p, %p, %p\n", segment->page_kind, segment->prev, segment->next, tld->small_free.first);
fflush(stderr);
}
else {
mi_assert_internal(segment->page_kind == MI_PAGE_SMALL); // for now we only support small pages
mi_assert_expensive(mi_segment_queue_contains(&tld->small_free, segment));
mi_segment_queue_remove(&tld->small_free, segment);
}
}
mi_assert_expensive(!mi_segment_queue_contains(&tld->small_free, segment));
mi_assert(segment->next == NULL);
mi_assert(segment->prev == NULL);
mi_stat_decrease( tld->stats->committed, segment->segment_info_size);
segment->thread_id = 0;
// update reset memory statistics
for (uint8_t i = 0; i < segment->capacity; i++) {
mi_page_t* page = &segment->pages[i];
if (page->is_reset) {
page->is_reset = false;
mi_stat_decrease( tld->stats->reset,mi_page_size(page));
}
}
if (segment->page_kind == MI_PAGE_HUGE) {
mi_segment_os_free(segment, segment->segment_size, tld);
}
else if (!force && mi_segment_cache_push(segment, tld)) {
// it is put in our cache
}
else {
// otherwise return it to the OS
mi_segment_os_free(segment, MI_SEGMENT_SIZE,tld);
}
}
/* -----------------------------------------------------------
Free page management inside a segment
----------------------------------------------------------- */
static bool mi_segment_has_free(const mi_segment_t* segment) {
return (segment->used < segment->capacity);
}
static mi_page_t* mi_segment_find_free(mi_segment_t* segment) {
mi_assert_internal(mi_segment_has_free(segment));
mi_assert_expensive(mi_segment_is_valid(segment));
for (size_t i = 0; i < segment->capacity; i++) {
mi_page_t* page = &segment->pages[i];
if (!page->segment_in_use) {
return page;
}
}
mi_assert(false);
return NULL;
}
/* -----------------------------------------------------------
Free
----------------------------------------------------------- */
static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld);
static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_stats_t* stats) {
UNUSED(stats);
mi_assert_internal(page->segment_in_use);
mi_assert_internal(mi_page_all_free(page));
size_t inuse = page->capacity * page->block_size;
mi_stat_decrease( stats->committed, inuse);
mi_stat_decrease( stats->pages, 1);
// reset the page memory to reduce memory pressure?
if (!page->is_reset && mi_option_is_enabled(mi_option_page_reset)) {
size_t psize;
uint8_t* start = _mi_segment_page_start(segment, page, &psize);
mi_stat_increase( stats->reset, psize); // for stats we assume resetting the full page
page->is_reset = true;
if (inuse > 0) {
_mi_os_reset(start, inuse);
}
}
// zero the page data
uint8_t idx = page->segment_idx; // don't clear the index
bool is_reset = page->is_reset; // don't clear the reset flag
memset(page, 0, sizeof(*page));
page->segment_idx = idx;
page->segment_in_use = false;
page->is_reset = is_reset;
segment->used--;
}
void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld)
{
mi_assert(page != NULL);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_expensive(mi_segment_is_valid(segment));
// mark it as free now
mi_segment_page_clear(segment, page, tld->stats);
if (segment->used == 0) {
// no more used pages; remove from the free list and free the segment
mi_segment_free(segment, force, tld);
}
else {
if (segment->used == segment->abandoned) {
// only abandoned pages; remove from free list and abandon
mi_segment_abandon(segment,tld);
}
else if (segment->used + 1 == segment->capacity) {
mi_assert_internal(segment->page_kind == MI_PAGE_SMALL); // for now we only support small pages
// move back to segments small pages free list
mi_segment_enqueue(&tld->small_free, segment);
}
}
}
/* -----------------------------------------------------------
Abandonment
----------------------------------------------------------- */
// When threads terminate, they can leave segments with
// live blocks (reached through other threads). Such segments
// are "abandoned" and will be reclaimed by other threads to
// reuse their pages and/or free them eventually
static volatile mi_segment_t* abandoned = NULL;
static volatile uintptr_t abandoned_count = 0;
static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
mi_assert_internal(segment->used == segment->abandoned);
mi_assert_internal(segment->used > 0);
mi_assert_internal(segment->abandoned_next == NULL);
mi_assert_expensive(mi_segment_is_valid(segment));
// remove the segment from the free page queue if needed
if (mi_segment_is_in_free_queue(segment,tld)) {
mi_assert(segment->page_kind == MI_PAGE_SMALL); // for now we only support small pages
mi_assert_expensive(mi_segment_queue_contains(&tld->small_free, segment));
mi_segment_queue_remove(&tld->small_free, segment);
}
mi_assert_internal(segment->next == NULL && segment->prev == NULL);
// all pages in the segment are abandoned; add it to the abandoned list
segment->thread_id = 0;
do {
segment->abandoned_next = (mi_segment_t*)abandoned;
} while (!mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, segment, segment->abandoned_next));
mi_atomic_increment(&abandoned_count);
mi_stat_increase( tld->stats->segments_abandoned,1);
}
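// A stand-alone sketch (C11 atomics; the real code uses the mi_atomic_* wrappers,
// whose argument order differs) of the lock-free list pattern used here: abandoning
// threads push segments onto a shared head with a CAS loop (as in mi_segment_abandon
// above), and reclaiming threads pop the head the same way (as in
// _mi_segment_try_reclaim_abandoned below). ABA concerns are ignored for brevity.
#include <stdatomic.h>
#include <stddef.h>

typedef struct anode_s { struct anode_s* next; } anode_t;

static _Atomic(anode_t*) alist = NULL;

static void apush(anode_t* n) {
  anode_t* old = atomic_load(&alist);
  do {
    n->next = old;   // link in front of the current head
  } while (!atomic_compare_exchange_weak(&alist, &old, n));  // `old` is refreshed on failure
}

static anode_t* apop(void) {
  anode_t* old = atomic_load(&alist);
  while (old != NULL && !atomic_compare_exchange_weak(&alist, &old, old->next)) {
    // CAS failed: `old` now holds the new head, retry
  }
  return old;   // NULL if the list was empty
}

int main(void) {
  anode_t n1, n2;
  apush(&n1); apush(&n2);
  return (apop() == &n2 && apop() == &n1 && apop() == NULL) ? 0 : 1;
}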
void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
mi_assert(page != NULL);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_expensive(mi_segment_is_valid(segment));
segment->abandoned++;
mi_stat_increase( tld->stats->pages_abandoned, 1);
mi_assert_internal(segment->abandoned <= segment->used);
if (segment->used == segment->abandoned) {
// all pages are abandoned, abandon the entire segment
mi_segment_abandon(segment,tld);
}
}
bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld) {
uintptr_t reclaimed = 0;
uintptr_t atmost;
if (try_all) {
atmost = abandoned_count+16; // close enough
}
else {
atmost = abandoned_count/8; // at most 1/8th of all outstanding (estimated)
if (atmost < 8) atmost = 8; // but at least 8
}
// reclaim at most `atmost` abandoned segments...
while(atmost > reclaimed) {
// try to claim the head of the abandoned segments
mi_segment_t* segment;
do {
segment = (mi_segment_t*)abandoned;
} while(segment != NULL && !mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, segment->abandoned_next, segment));
if (segment==NULL) break; // stop early if no more segments available
// got it.
mi_atomic_decrement(&abandoned_count);
segment->thread_id = _mi_thread_id();
segment->abandoned_next = NULL;
mi_segments_count_add(1,tld);
mi_assert_internal(segment->next == NULL && segment->prev == NULL);
mi_assert_expensive(mi_segment_is_valid(segment));
mi_stat_decrease(tld->stats->segments_abandoned,1);
// add its free pages to the current thread
if (segment->page_kind == MI_PAGE_SMALL && mi_segment_has_free(segment)) {
mi_segment_enqueue(&tld->small_free, segment);
}
// add its abandoned pages to the current thread
mi_assert(segment->abandoned == segment->used);
for (size_t i = 0; i < segment->capacity; i++) {
mi_page_t* page = &segment->pages[i];
if (page->segment_in_use) {
segment->abandoned--;
mi_assert(page->next == NULL);
mi_stat_decrease( tld->stats->pages_abandoned, 1);
if (mi_page_all_free(page)) {
// if everything free by now, free the page
mi_segment_page_clear(segment,page,tld->stats);
}
else {
// otherwise reclaim it
_mi_page_reclaim(heap,page);
}
}
}
mi_assert(segment->abandoned == 0);
if (segment->used == 0) { // due to page_clear
mi_segment_free(segment,false,tld);
}
else {
reclaimed++;
}
}
return (reclaimed>0);
}
/* -----------------------------------------------------------
Small page allocation
----------------------------------------------------------- */
// Allocate a small page inside a segment.
// Requires that the page has free pages
static mi_page_t* mi_segment_small_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) {
mi_assert_internal(mi_segment_has_free(segment));
mi_page_t* page = mi_segment_find_free(segment);
page->segment_in_use = true;
segment->used++;
mi_assert_internal(segment->used <= segment->capacity);
if (segment->used == segment->capacity) {
// if no more free pages, remove from the queue
mi_assert_internal(!mi_segment_has_free(segment));
mi_assert_expensive(mi_segment_queue_contains(&tld->small_free, segment));
mi_segment_queue_remove(&tld->small_free, segment);
}
return page;
}
static mi_page_t* mi_segment_small_page_alloc(mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
if (mi_segment_queue_is_empty(&tld->small_free)) {
mi_segment_t* segment = mi_segment_alloc(0,MI_PAGE_SMALL,MI_SMALL_PAGE_SHIFT,tld,os_tld);
if (segment == NULL) return NULL;
mi_segment_enqueue(&tld->small_free, segment);
}
mi_assert_internal(tld->small_free.first != NULL);
return mi_segment_small_page_alloc_in(tld->small_free.first,tld);
}
/* -----------------------------------------------------------
large page allocation
----------------------------------------------------------- */
static mi_page_t* mi_segment_large_page_alloc(mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
mi_segment_t* segment = mi_segment_alloc(0,MI_PAGE_LARGE,MI_LARGE_PAGE_SHIFT,tld,os_tld);
if (segment == NULL) return NULL;
segment->used = 1;
mi_page_t* page = &segment->pages[0];
page->segment_in_use = true;
return page;
}
static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
{
mi_segment_t* segment = mi_segment_alloc(size, MI_PAGE_HUGE, MI_SEGMENT_SHIFT,tld,os_tld);
if (segment == NULL) return NULL;
mi_assert_internal(segment->segment_size - segment->segment_info_size >= size);
segment->used = 1;
mi_page_t* page = &segment->pages[0];
page->segment_in_use = true;
return page;
}
/* -----------------------------------------------------------
Page allocation and free
----------------------------------------------------------- */
mi_page_t* _mi_segment_page_alloc(size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
mi_page_t* page;
if (block_size < MI_SMALL_PAGE_SIZE / 8)
// blocks smaller than 8kb (assuming MI_SMALL_PAGE_SIZE == 64kb)
page = mi_segment_small_page_alloc(tld,os_tld);
else if (block_size < (MI_LARGE_SIZE_MAX - sizeof(mi_segment_t)))
page = mi_segment_large_page_alloc(tld, os_tld);
else
page = mi_segment_huge_page_alloc(block_size,tld,os_tld);
mi_assert_expensive(mi_segment_is_valid(_mi_page_segment(page)));
return page;
}
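// A stand-alone sketch (thresholds assumed: 64kb small pages and a 256kb
// MI_LARGE_SIZE_MAX as per the header comment of this file; the real cut-off also
// subtracts sizeof(mi_segment_t)) of the block-size dispatch above.
#include <stdio.h>
#include <stddef.h>

static const char* page_kind_for(size_t block_size) {
  const size_t small_page_size = 64 * 1024;    // assumed MI_SMALL_PAGE_SIZE
  const size_t large_size_max  = 256 * 1024;   // assumed MI_LARGE_SIZE_MAX
  if (block_size < small_page_size / 8)  return "small page (in a shared segment)";
  else if (block_size < large_size_max)  return "large page (one per segment)";
  else                                   return "huge (dedicated segment)";
}

int main(void) {
  printf("%7zu bytes -> %s\n", (size_t)64,       page_kind_for(64));
  printf("%7zu bytes -> %s\n", (size_t)16*1024,  page_kind_for(16*1024));
  printf("%7zu bytes -> %s\n", (size_t)512*1024, page_kind_for(512*1024));
  return 0;
}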

24
src/static.c Normal file

@ -0,0 +1,24 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#define _DEFAULT_SOURCE
#include "mimalloc.h"
#include "mimalloc-internal.h"
// For a static override we create a single object file
// containing the whole library. If it is linked first
// it will override all the standard library allocation
// functions (on Unix's).
#include "stats.c"
#include "os.c"
#include "segment.c"
#include "page.c"
#include "heap.c"
#include "alloc.c"
#include "alloc-aligned.c"
#include "init.c"
#include "options.c"

414
src/stats.c Normal file

@ -0,0 +1,414 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"license.txt" at the root of this distribution.
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"
#include <string.h> // memset
/* -----------------------------------------------------------
Merge thread statistics with the main one.
----------------------------------------------------------- */
static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src);
void _mi_stats_done(mi_stats_t* stats) {
if (stats == &_mi_stats_main) return;
mi_stats_add(&_mi_stats_main, stats);
memset(stats,0,sizeof(*stats));
}
/* -----------------------------------------------------------
Statistics operations
----------------------------------------------------------- */
static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
if (amount == 0) return;
bool in_main = ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main
&& (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t)));
if (in_main)
{
// add atomically (for abandoned pages)
int64_t current = mi_atomic_add(&stat->current,amount);
if (current > stat->peak) stat->peak = stat->current; // racing.. it's ok
if (amount > 0) {
mi_atomic_add(&stat->allocated,amount);
}
else {
mi_atomic_add(&stat->freed, -amount);
}
}
else {
// add thread local
stat->current += amount;
if (stat->current > stat->peak) stat->peak = stat->current;
if (amount > 0) {
stat->allocated += amount;
}
else {
stat->freed += -amount;
}
}
}
void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
// TODO: add thread safe code
stat->count++;
stat->total += amount;
}
void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
mi_stat_update(stat, (int64_t)amount);
}
void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
mi_stat_update(stat, -((int64_t)amount));
}
// must be thread safe as it is called from stats_merge
static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
if (stat==src) return;
mi_atomic_add( &stat->allocated, src->allocated * unit);
mi_atomic_add( &stat->current, src->current * unit);
mi_atomic_add( &stat->freed, src->freed * unit);
mi_atomic_add( &stat->peak, src->peak * unit);
}
static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) {
if (stat==src) return;
mi_atomic_add( &stat->total, src->total * unit);
mi_atomic_add( &stat->count, src->count * unit);
}
// must be thread safe as it is called from stats_merge
static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
if (stats==src) return;
mi_stat_add(&stats->segments, &src->segments,1);
mi_stat_add(&stats->pages, &src->pages,1);
mi_stat_add(&stats->reserved, &src->reserved, 1);
mi_stat_add(&stats->committed, &src->committed, 1);
mi_stat_add(&stats->reset, &src->reset, 1);
mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1);
mi_stat_add(&stats->mmap_calls, &src->mmap_calls, 1);
mi_stat_add(&stats->mmap_ensure_aligned, &src->mmap_ensure_aligned, 1);
mi_stat_add(&stats->mmap_right_align, &src->mmap_right_align, 1);
mi_stat_add(&stats->threads, &src->threads, 1);
mi_stat_add(&stats->pages_extended, &src->pages_extended, 1);
mi_stat_add(&stats->malloc, &src->malloc, 1);
mi_stat_add(&stats->huge, &src->huge, 1);
mi_stat_counter_add(&stats->searches, &src->searches, 1);
#if MI_STAT>1
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
if (src->normal[i].allocated > 0 || src->normal[i].freed > 0) {
mi_stat_add(&stats->normal[i], &src->normal[i], 1);
}
}
#endif
}
/* -----------------------------------------------------------
Display statistics
----------------------------------------------------------- */
static void mi_printf_amount(int64_t n, int64_t unit, FILE* out, const char* fmt) {
char buf[32];
int len = 32;
char* suffix = (unit <= 0 ? " " : "b");
double base = (unit == 0 ? 1000.0 : 1024.0);
if (unit>0) n *= unit;
double pos = (double)(n < 0 ? -n : n);
if (pos < base)
snprintf(buf,len, "%d %s ", (int)n, suffix);
else if (pos < base*base)
snprintf(buf, len, "%.1f k%s", (double)n / base, suffix);
else if (pos < base*base*base)
snprintf(buf, len, "%.1f m%s", (double)n / (base*base), suffix);
else
snprintf(buf, len, "%.1f g%s", (double)n / (base*base*base), suffix);
_mi_fprintf(out, (fmt==NULL ? "%11s" : fmt), buf);
}
#if MI_STAT>0
static void mi_print_amount(int64_t n, int64_t unit, FILE* out) {
mi_printf_amount(n,unit,out,NULL);
}
static void mi_print_count(int64_t n, int64_t unit, FILE* out) {
if (unit==1) _mi_fprintf(out,"%11s"," ");
else mi_print_amount(n,0,out);
}
static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, FILE* out ) {
_mi_fprintf(out,"%10s:", msg);
mi_print_amount(stat->peak, unit, out);
if (unit!=0) {
mi_print_amount(stat->allocated, unit, out);
mi_print_amount(stat->freed, unit, out);
}
if (unit>0) {
mi_print_amount(unit, (unit==0 ? 0 : 1), out);
mi_print_count(stat->allocated, unit, out);
if (stat->allocated > stat->freed)
_mi_fprintf(out, " not all freed!\n");
else
_mi_fprintf(out, " ok\n");
}
else {
_mi_fprintf(out, "\n");
}
}
static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, FILE* out ) {
double avg = (stat->count == 0 ? 0.0 : (double)stat->total / (double)stat->count);
_mi_fprintf(out,"%10s: %7.1f avg\n", msg, avg);
}
#endif
static void mi_print_header( FILE* out ) {
_mi_fprintf(out,"%10s: %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "unit ", "count ");
}
#if MI_STAT>1
static void mi_stats_print_bins(mi_stat_count_t* all, const mi_stat_count_t* bins, size_t max, const char* fmt, FILE* out) {
bool found = false;
char buf[64];
for (size_t i = 0; i <= max; i++) {
if (bins[i].allocated > 0) {
found = true;
int64_t unit = _mi_bin_size((uint8_t)i);
snprintf(buf, 64, "%s %3zu", fmt, i);
mi_stat_add(all, &bins[i], unit);
mi_stat_print(&bins[i], buf, unit, out);
}
}
//snprintf(buf, 64, "%s all", fmt);
//mi_stat_print(all, buf, 1);
if (found) {
_mi_fprintf(out, "\n");
mi_print_header(out);
}
}
#endif
static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim);
static void _mi_stats_print(mi_stats_t* stats, double secs, FILE* out) mi_attr_noexcept {
if (out == NULL) out = stderr;
mi_print_header(out);
#if !defined(MI_STAT) || (MI_STAT==0)
UNUSED(stats);
//_mi_fprintf(out,"(mimalloc built without statistics)\n");
#else
#if MI_STAT>1
mi_stat_count_t normal = { 0,0,0,0 };
mi_stats_print_bins(&normal, stats->normal, MI_BIN_HUGE, "normal",out);
mi_stat_print(&normal, "normal", 1, out);
#endif
mi_stat_print(&stats->huge, "huge", 1, out);
#if MI_STAT>1
mi_stat_count_t total = { 0,0,0,0 };
mi_stat_add(&total, &normal, 1);
mi_stat_add(&total, &stats->huge, 1);
mi_stat_print(&total, "total", 1, out);
#endif
_mi_fprintf(out, "malloc requested: ");
mi_print_amount(stats->malloc.allocated, 1, out);
_mi_fprintf(out, "\n\n");
mi_stat_print(&stats->committed, "committed", 1, out);
mi_stat_print(&stats->reserved, "reserved", 1, out);
mi_stat_print(&stats->reset, "reset", -1, out);
mi_stat_print(&stats->segments, "segments", -1, out);
mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out);
mi_stat_print(&stats->pages, "pages", -1, out);
mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out);
mi_stat_print(&stats->pages_extended, "-extended", 0, out);
mi_stat_print(&stats->mmap_calls, "mmaps", 0, out);
mi_stat_print(&stats->mmap_right_align, "mmap fast", 0, out);
mi_stat_print(&stats->mmap_ensure_aligned, "mmap slow", 0, out);
mi_stat_print(&stats->threads, "threads", 0, out);
mi_stat_counter_print(&stats->searches, "searches", out);
#endif
if (secs >= 0.0) _mi_fprintf(out, "%10s: %9.3f s\n", "elapsed", secs);
double user_time;
double sys_time;
size_t peak_rss;
size_t page_faults;
size_t page_reclaim;
mi_process_info(&user_time, &sys_time, &peak_rss, &page_faults, &page_reclaim);
_mi_fprintf(out,"%10s: user: %.3f s, system: %.3f s, faults: %lu, reclaims: %lu, rss: ", "process", user_time, sys_time, (unsigned long)page_faults, (unsigned long)page_reclaim );
mi_printf_amount((int64_t)peak_rss, 1, out, "%s");
_mi_fprintf(out,"\n");
}
static double mi_clock_end(double start);
static double mi_clock_start();
static double mi_time_start = 0.0;
static mi_stats_t* mi_stats_get_default() {
mi_heap_t* heap = mi_heap_get_default();
return &heap->tld->stats;
}
void mi_stats_reset() mi_attr_noexcept {
mi_stats_t* stats = mi_stats_get_default();
if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
mi_time_start = mi_clock_start();
}
static void mi_stats_print_ex(mi_stats_t* stats, double secs, FILE* out) {
if (stats != &_mi_stats_main) {
mi_stats_add(&_mi_stats_main,stats);
memset(stats,0,sizeof(mi_stats_t));
}
_mi_stats_print(&_mi_stats_main, secs, out);
}
void mi_stats_print(FILE* out) mi_attr_noexcept {
mi_stats_print_ex(mi_stats_get_default(),mi_clock_end(mi_time_start),out);
}
void mi_thread_stats_print(FILE* out) mi_attr_noexcept {
_mi_stats_print(mi_stats_get_default(), mi_clock_end(mi_time_start), out);
}
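// A hypothetical usage sketch (stand-alone program) of the statistics entry points
// defined above; mi_malloc/mi_free are the regular allocation API from mimalloc.h.
#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  mi_stats_reset();                    // start a fresh measurement window
  for (int i = 0; i < 1000; i++) {
    void* p = mi_malloc(64);
    mi_free(p);
  }
  mi_stats_print(NULL);                // NULL sends the report to stderr
  return 0;
}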
// --------------------------------------------------------
// Basic timer for convenience
// --------------------------------------------------------
#ifdef _WIN32
#include <windows.h>
static double mi_to_seconds(LARGE_INTEGER t) {
static double freq = 0.0;
if (freq <= 0.0) {
LARGE_INTEGER f;
QueryPerformanceFrequency(&f);
freq = (double)(f.QuadPart);
}
return ((double)(t.QuadPart) / freq);
}
static double mi_clock_now() {
LARGE_INTEGER t;
QueryPerformanceCounter(&t);
return mi_to_seconds(t);
}
#else
#include <time.h>
#ifdef TIME_UTC
static double mi_clock_now() {
struct timespec t;
timespec_get(&t, TIME_UTC);
return (double)t.tv_sec + (1.0e-9 * (double)t.tv_nsec);
}
#else
// low resolution timer
static double mi_clock_now() {
return ((double)clock() / (double)CLOCKS_PER_SEC);
}
#endif
#endif
static double mi_clock_diff = 0.0;
static double mi_clock_start() {
if (mi_clock_diff == 0.0) {
double t0 = mi_clock_now();
mi_clock_diff = mi_clock_now() - t0;
}
return mi_clock_now();
}
static double mi_clock_end(double start) {
double end = mi_clock_now();
return (end - start - mi_clock_diff);
}
// --------------------------------------------------------
// Basic process statistics
// --------------------------------------------------------
#if defined(_WIN32)
#include <windows.h>
#include <psapi.h>
#pragma comment(lib,"psapi.lib")
static double filetime_secs(const FILETIME* ftime) {
ULARGE_INTEGER i;
i.LowPart = ftime->dwLowDateTime;
i.HighPart = ftime->dwHighDateTime;
double secs = (double)(i.QuadPart) * 1.0e-7; // FILETIME is in 100 nano seconds
return secs;
}
static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim) {
FILETIME ct;
FILETIME ut;
FILETIME st;
FILETIME et;
GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
*utime = filetime_secs(&ut);
*stime = filetime_secs(&st);
PROCESS_MEMORY_COUNTERS info;
GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
*peak_rss = (size_t)info.PeakWorkingSetSize;
*page_faults = (size_t)info.PageFaultCount;
*page_reclaim = 0;
}
#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>
#if defined(__APPLE__) && defined(__MACH__)
#include <mach/mach.h>
#endif
static double timeval_secs(const struct timeval* tv) {
return (double)tv->tv_sec + ((double)tv->tv_usec * 1.0e-6);
}
static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim) {
struct rusage rusage;
getrusage(RUSAGE_SELF, &rusage);
#if defined(__APPLE__) && defined(__MACH__)
*peak_rss = rusage.ru_maxrss;
#else
*peak_rss = rusage.ru_maxrss * 1024;
#endif
*page_faults = rusage.ru_majflt;
*page_reclaim = rusage.ru_minflt;
*utime = timeval_secs(&rusage.ru_utime);
*stime = timeval_secs(&rusage.ru_stime);
}
#else
#pragma message("define a way to get process info")
static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim) {
*peak_rss = 0;
*page_faults = 0;
*page_reclaim = 0;
*utime = 0.0;
*stime = 0.0;
}
#endif