From 03b363a1c289ad4461c219050466a9f7de0b8432 Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 29 Jan 2020 22:46:44 -0800 Subject: [PATCH 01/17] first working tls on macOS using interpose; still slow --- CMakeLists.txt | 2 +- include/mimalloc-internal.h | 38 +++++++++++------- src/alloc-override.c | 7 +++- src/alloc.c | 2 +- src/init.c | 62 ++++++++++++++++------------ src/options.c | 32 ++++++++++----- src/random.c | 34 ++++++++-------- src/segment.c | 80 ++++++++++++++++++------------------- test/test-stress.c | 18 ++++----- 9 files changed, 155 insertions(+), 120 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b60e64a4..2da7974b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -247,7 +247,7 @@ if (MI_BUILD_TESTS MATCHES "ON") target_compile_definitions(mimalloc-test-stress PRIVATE ${mi_defines}) target_compile_options(mimalloc-test-stress PRIVATE ${mi_cflags}) target_include_directories(mimalloc-test-stress PRIVATE include) - target_link_libraries(mimalloc-test-stress PRIVATE mimalloc-static ${mi_libraries}) + target_link_libraries(mimalloc-test-stress PRIVATE mimalloc ${mi_libraries}) enable_testing() add_test(test_api, mimalloc-test-api) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index c7d7a1da..f4b578f6 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -33,7 +33,7 @@ terms of the MIT license. A copy of the license can be found in the file #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) -#define mi_decl_cache_align +#define mi_decl_cache_align #endif @@ -51,6 +51,7 @@ void _mi_random_init(mi_random_ctx_t* ctx); void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); uintptr_t _mi_random_next(mi_random_ctx_t* ctx); uintptr_t _mi_heap_random_next(mi_heap_t* heap); +uintptr_t _os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c @@ -233,7 +234,7 @@ static inline size_t _mi_wsize_from_size(size_t size) { // Overflow detecting multiply -static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { +static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { #if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5 #include // UINT_MAX, ULONG_MAX #if (SIZE_MAX == UINT_MAX) @@ -274,18 +275,24 @@ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value o extern mi_heap_t _mi_heap_main; // statically allocated main backing heap extern bool _mi_process_is_initialized; -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -static inline mi_heap_t* mi_get_default_heap(void) { #ifdef MI_TLS_RECURSE_GUARD +extern mi_heap_t* _mi_get_default_heap_tls_safe(void); +static inline mi_heap_t* mi_get_default_heap(void) { // on some BSD platforms, like macOS, the dynamic loader calls `malloc` // to initialize thread local data. To avoid recursion, we need to avoid // accessing the thread local `_mi_default_heap` until our module is loaded // and use the statically allocated main heap until that time. // TODO: patch ourselves dynamically to avoid this check every time? 
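  // To make the hazard concrete (an illustrative sketch, not part of the
  // patch): with malloc interposed and the default heap in a `__thread`
  // variable, the first TLS access can itself allocate,
  //
  //   malloc -> mi_malloc -> mi_get_default_heap
  //          -> loader allocates TLS storage -> malloc -> ... (recursion)
  //
  // so until the module is fully initialized the lookup must answer through
  // a path that touches no `__thread` storage, as the change below does.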
-  if (!_mi_process_is_initialized) return &_mi_heap_main;
-#endif
+  return _mi_get_default_heap_tls_safe();
+#else
+
+extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+
+static inline mi_heap_t* mi_get_default_heap(void) {
   return _mi_heap_default;
+
+#endif
 }
 
 static inline bool mi_heap_is_default(const mi_heap_t* heap) {
@@ -302,6 +309,7 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
 }
 
 static inline uintptr_t _mi_ptr_cookie(const void* p) {
+  mi_assert_internal(_mi_heap_main.cookie != 0);
   return ((uintptr_t)p ^ _mi_heap_main.cookie);
 }
 
@@ -345,7 +353,7 @@ static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, con
 
 // Get the page containing the pointer
 static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
-  uintptr_t idx = _mi_segment_page_idx_of(segment, p); 
+  uintptr_t idx = _mi_segment_page_idx_of(segment, p);
   return &((mi_segment_t*)segment)->pages[idx];
 }
 
@@ -411,14 +419,14 @@ static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t*
   return mi_tf_make(block, mi_tf_delayed(tf));
 }
 
-// are all blocks in a page freed? 
+// are all blocks in a page freed?
 // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.
 static inline bool mi_page_all_free(const mi_page_t* page) {
   mi_assert_internal(page != NULL);
   return (page->used == 0);
 }
 
-// are there any available blocks? 
+// are there any available blocks?
 static inline bool mi_page_has_any_available(const mi_page_t* page) {
   mi_assert_internal(page != NULL && page->reserved > 0);
   return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
@@ -466,11 +474,11 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
 
 /* -------------------------------------------------------------------
 Encoding/Decoding the free list next pointers
-This is to protect against buffer overflow exploits where the 
-free list is mutated. Many hardened allocators xor the next pointer `p` 
+This is to protect against buffer overflow exploits where the
+free list is mutated. Many hardened allocators xor the next pointer `p`
 with a secret key `k1`, as `p^k1`. This prevents overwriting with known
-values but might be still too weak: if the attacker can guess 
-the pointer `p` this can reveal `k1` (since `p^k1^p == k1`). 
+values but might be still too weak: if the attacker can guess
+the pointer `p` this can reveal `k1` (since `p^k1^p == k1`).
 Moreover, if multiple blocks can be read as well, the attacker can
 xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
 about the pointers (and subsequently `k1`).
@@ -478,9 +486,9 @@ about the pointers (and subsequently `k1`).
 
 Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
 
diff --git a/src/alloc-override.c b/src/alloc-override.c
   struct mi_interpose_s {
@@ -54,7 +58,7 @@ terms of the MIT license. A copy of the license can be found in the file
     MI_INTERPOSE_MI(malloc),
     MI_INTERPOSE_MI(calloc),
     MI_INTERPOSE_MI(realloc),
-    MI_INTERPOSE_MI(free),
+    MI_INTERPOSEX(free,mi_free_tls_safe),
     MI_INTERPOSE_MI(strdup),
     MI_INTERPOSE_MI(strndup)
   };
@@ -194,4 +198,3 @@ int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_me
 #endif
 
 #endif // MI_MALLOC_OVERRIDE && !_WIN32
-
diff --git a/src/alloc.c b/src/alloc.c
index 3f577f2f..d60c33bf 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -21,7 +21,7 @@ terms of the MIT license. A copy of the license can be found in the file
 
 // Fast allocation in a page: just pop from the free list.
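As a standalone reading of the free-list encoding comment reconstructed above, a
minimal sketch (helper names are illustrative; mimalloc's own versions use
per-page keys):

#include <stdint.h>

// encode: ((p^k2) <<< k1) + k1. Unlike plain `p^k1`, xor, rotate, and add do
// not associate, so xoring two encoded values no longer cancels the key.
static inline uintptr_t rotl(uintptr_t x, uintptr_t n) {
  const uintptr_t bits = 8*sizeof(uintptr_t);
  n %= bits;
  return (n==0 ? x : ((x << n) | (x >> (bits - n))));
}
static inline uintptr_t rotr(uintptr_t x, uintptr_t n) {
  const uintptr_t bits = 8*sizeof(uintptr_t);
  n %= bits;
  return (n==0 ? x : ((x >> n) | (x << (bits - n))));
}
static inline uintptr_t ptr_encode(uintptr_t p, uintptr_t k1, uintptr_t k2) {
  return rotl(p ^ k2, k1) + k1;
}
static inline uintptr_t ptr_decode(uintptr_t e, uintptr_t k1, uintptr_t k2) {
  return rotr(e - k1, k1) ^ k2;   // exact inverse: subtract, rotate back, xor
}

Round-tripping any `p` through `ptr_decode(ptr_encode(p, k1, k2), k1, k2)`
returns `p` for arbitrary keys.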
// Fall back to generic allocation only if the list is empty. -extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { +extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); mi_block_t* block = page->free; if (mi_unlikely(block == NULL)) { diff --git a/src/init.c b/src/init.c index f8411187..922b7438 100644 --- a/src/init.c +++ b/src/init.c @@ -104,9 +104,9 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; static mi_tld_t tld_main = { 0, false, &_mi_heap_main, - { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, - 0, 0, 0, 0, 0, 0, NULL, - tld_main_stats, tld_main_os + { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, + 0, 0, 0, 0, 0, 0, NULL, + tld_main_stats, tld_main_os }, // segments { 0, tld_main_stats }, // os { MI_STATS_NULL } // stats @@ -124,9 +124,9 @@ mi_heap_t _mi_heap_main = { MI_PAGE_QUEUES_EMPTY, ATOMIC_VAR_INIT(NULL), 0, // thread id - MI_INIT_COOKIE, // initial cookie - { MI_INIT_COOKIE, MI_INIT_COOKIE }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) - { {0}, {0}, 0 }, // random + 0, // initial cookie + { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) + { {0x846ca68b}, {0}, 0 }, // random 0, // page count false // can reclaim }; @@ -148,14 +148,15 @@ typedef struct mi_thread_data_s { // Initialize the thread local default heap, called from `mi_thread_init` static bool _mi_heap_init(void) { - if (mi_heap_is_initialized(_mi_heap_default)) return true; + if (mi_heap_is_initialized(mi_get_default_heap())) return true; if (_mi_is_main_thread()) { + mi_assert_internal(_mi_heap_main.thread_id != 0); // the main heap is statically allocated _mi_heap_set_default_direct(&_mi_heap_main); - mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap()); + //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap()); } else { - // use `_mi_os_alloc` to allocate directly from the OS + // use `_mi_os_alloc` to allocate directly from the OS mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t),&_mi_stats_main); // Todo: more efficient allocation? if (td == NULL) { _mi_error_message(ENOMEM, "failed to allocate thread local heap memory\n"); @@ -170,7 +171,7 @@ static bool _mi_heap_init(void) { heap->cookie = _mi_heap_random_next(heap) | 1; heap->key[0] = _mi_heap_random_next(heap); heap->key[1] = _mi_heap_random_next(heap); - heap->tld = tld; + heap->tld = tld; tld->heap_backing = heap; tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; @@ -265,8 +266,9 @@ static void _mi_thread_done(mi_heap_t* default_heap); #endif // Set up handlers so `mi_thread_done` is called automatically +static bool tls_initialized = false; // fine if it races + static void mi_process_setup_auto_thread_done(void) { - static bool tls_initialized = false; // fine if it races if (tls_initialized) return; tls_initialized = true; #if defined(_WIN32) && defined(MI_SHARED_LIB) @@ -317,7 +319,9 @@ static void _mi_thread_done(mi_heap_t* heap) { void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(heap != NULL); + #ifndef MI_TLS_RECURSE_GUARD _mi_heap_default = heap; + #endif // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. 
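The registration below relies on thread-specific-data destructors; a minimal
standalone sketch of that mechanism (names illustrative, assumes POSIX
threads):

#include <pthread.h>
#include <stdio.h>

static pthread_key_t done_key;

static void on_thread_done(void* value) {   // stands in for _mi_thread_done
  printf("thread exiting, heap=%p\n", value);
}

static void* worker(void* heap) {
  pthread_setspecific(done_key, heap);      // non-NULL => destructor runs at exit
  return NULL;
}

int main(void) {
  pthread_key_create(&done_key, &on_thread_done);
  pthread_t t;
  int fake_heap;
  pthread_create(&t, NULL, &worker, &fake_heap);
  pthread_join(t, NULL);                    // destructor has run by now
  return 0;
}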
@@ -330,7 +334,11 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { #endif } - +mi_heap_t* _mi_get_default_heap_tls_safe(void) { + if (mi_unlikely(mi_pthread_key==0)) return (mi_heap_t*)&_mi_heap_empty; + mi_heap_t* heap = pthread_getspecific(mi_pthread_key); + return (mi_likely(heap!=NULL) ? heap : (mi_heap_t*)&_mi_heap_empty); +} // -------------------------------------------------------- // Run functions on process init/done, and thread init/done @@ -339,6 +347,7 @@ static void mi_process_done(void); static bool os_preloading = true; // true until this module is initialized static bool mi_redirected = false; // true if malloc redirects to mi_malloc +bool _mi_tls_initialized = false; // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. bool _mi_preloading() { @@ -383,7 +392,10 @@ static void mi_allocator_done() { // Called once by the process loader static void mi_process_load(void) { + volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; + UNUSED(dummy); os_preloading = false; + _mi_tls_initialized = true; atexit(&mi_process_done); _mi_options_init(); mi_process_init(); @@ -398,26 +410,26 @@ static void mi_process_load(void) { } } +void _mi_heap_main_init(void) { + if (_mi_heap_main.cookie == 0) { + _mi_heap_main.thread_id = _mi_thread_id(); + _mi_heap_main.cookie = _os_random_weak((uintptr_t)&_mi_heap_main_init); + _mi_random_init(&_mi_heap_main.random); + _mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main); + _mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main); + } +} + // Initialize the process; called by thread_init or the process loader void mi_process_init(void) mi_attr_noexcept { // ensure we are called once if (_mi_process_is_initialized) return; - // access _mi_heap_default before setting _mi_process_is_initialized to ensure - // that the TLS slot is allocated without getting into recursion on macOS - // when using dynamic linking with interpose. - mi_get_default_heap(); _mi_process_is_initialized = true; - - _mi_heap_main.thread_id = _mi_thread_id(); - _mi_verbose_message("process init: 0x%zx\n", _mi_heap_main.thread_id); - _mi_random_init(&_mi_heap_main.random); - #ifndef __APPLE__ // TODO: fix this? cannot update cookie if allocation already happened.. - _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main); - #endif mi_process_setup_auto_thread_done(); + + _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id()); _mi_os_init(); + _mi_heap_main_init(); #if (MI_DEBUG) _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif diff --git a/src/options.c b/src/options.c index af051aa2..c0bf9680 100644 --- a/src/options.c +++ b/src/options.c @@ -53,7 +53,7 @@ static mi_option_desc_t options[_mi_option_last] = // stable options { MI_DEBUG, UNINIT, MI_OPTION(show_errors) }, { 0, UNINIT, MI_OPTION(show_stats) }, - { 0, UNINIT, MI_OPTION(verbose) }, + { 1, UNINIT, MI_OPTION(verbose) }, // the following options are experimental and not all combinations make sense. { 1, UNINIT, MI_OPTION(eager_commit) }, // commit on demand @@ -239,16 +239,30 @@ static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT // inside the C runtime causes another message. 
static mi_decl_thread bool recurse = false; +static bool mi_recurse_enter(void) { + #ifdef MI_TLS_RECURSE_GUARD + if (_mi_preloading()) return true; + #endif + if (recurse) return false; + recurse = true; + return true; +} + +static void mi_recurse_exit(void) { + #ifdef MI_TLS_RECURSE_GUARD + if (_mi_preloading()) return; + #endif + recurse = false; +} + void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) { - if (recurse) return; + if (!mi_recurse_enter()) return; if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr? out = mi_out_get_default(&arg); } - recurse = true; if (prefix != NULL) out(prefix,arg); out(message,arg); - recurse = false; - return; + mi_recurse_exit(); } // Define our own limited `fprintf` that avoids memory allocation. @@ -256,14 +270,12 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { char buf[512]; if (fmt==NULL) return; - if (recurse) return; - recurse = true; + if (!mi_recurse_enter()) return; vsnprintf(buf,sizeof(buf)-1,fmt,args); - recurse = false; + mi_recurse_exit(); _mi_fputs(out,arg,prefix,buf); } - void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) { va_list args; va_start(args,fmt); @@ -290,7 +302,7 @@ void _mi_verbose_message(const char* fmt, ...) { static void mi_show_error_message(const char* fmt, va_list args) { if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; if (mi_atomic_increment(&error_count) > mi_max_error_count) return; - mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args); + mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args); } void _mi_warning_message(const char* fmt, ...) { diff --git a/src/random.c b/src/random.c index 6fef2434..b3dbf4f8 100644 --- a/src/random.c +++ b/src/random.c @@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- We use our own PRNG to keep predictable performance of random number generation -and to avoid implementations that use a lock. We only use the OS provided +and to avoid implementations that use a lock. We only use the OS provided random source to initialize the initial seeds. Since we do not need ultimate performance but we do rely on the security (for secret cookies in secure mode) we use a cryptographically secure generator (chacha20). @@ -21,11 +21,11 @@ we use a cryptographically secure generator (chacha20). /* ---------------------------------------------------------------------------- -Chacha20 implementation as the original algorithm with a 64-bit nonce +Chacha20 implementation as the original algorithm with a 64-bit nonce and counter: https://en.wikipedia.org/wiki/Salsa20 The input matrix has sixteen 32-bit values: Position 0 to 3: constant key -Position 4 to 11: the key +Position 4 to 11: the key Position 12 to 13: the counter. Position 14 to 15: the nonce. 
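Spelled out as code, that input matrix looks as follows (a sketch;
`chacha_state_init` is an illustrative helper, and the four constants are
chacha's standard "expand 32-byte k" words):

#include <stdint.h>
#include <string.h>

static void chacha_state_init(uint32_t st[16], const uint32_t key[8],
                              uint64_t counter, uint64_t nonce) {
  st[0] = 0x61707865; st[1] = 0x3320646e;   // positions 0..3: the constant
  st[2] = 0x79622d32; st[3] = 0x6b206574;   // ("expa" "nd 3" "2-by" "te k")
  memcpy(&st[4], key, 8*sizeof(uint32_t));  // positions 4..11: the key
  st[12] = (uint32_t)counter;               // positions 12..13: the counter
  st[13] = (uint32_t)(counter >> 32);
  st[14] = (uint32_t)nonce;                 // positions 14..15: the nonce
  st[15] = (uint32_t)(nonce >> 32);
}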
@@ -44,8 +44,8 @@ static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7); } -static void chacha_block(mi_random_ctx_t* ctx) -{ +static void chacha_block(mi_random_ctx_t* ctx) +{ // scramble into `x` uint32_t x[16]; for (size_t i = 0; i < 16; i++) { @@ -72,8 +72,8 @@ static void chacha_block(mi_random_ctx_t* ctx) ctx->input[12] += 1; if (ctx->input[12] == 0) { ctx->input[13] += 1; - if (ctx->input[13] == 0) { // and keep increasing into the nonce - ctx->input[14] += 1; + if (ctx->input[13] == 0) { // and keep increasing into the nonce + ctx->input[14] += 1; } } } @@ -83,7 +83,7 @@ static uint32_t chacha_next32(mi_random_ctx_t* ctx) { chacha_block(ctx); ctx->output_available = 16; // (assign again to suppress static analysis warning) } - const uint32_t x = ctx->output[16 - ctx->output_available]; + const uint32_t x = ctx->output[16 - ctx->output_available]; ctx->output[16 - ctx->output_available] = 0; // reset once the data is handed out ctx->output_available--; return x; @@ -94,9 +94,9 @@ static inline uint32_t read32(const uint8_t* p, size_t idx32) { return ((uint32_t)p[i+0] | (uint32_t)p[i+1] << 8 | (uint32_t)p[i+2] << 16 | (uint32_t)p[i+3] << 24); } -static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce) +static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce) { - // since we only use chacha for randomness (and not encryption) we + // since we only use chacha for randomness (and not encryption) we // do not _need_ to read 32-bit values as little endian but we do anyways // just for being compatible :-) memset(ctx, 0, sizeof(*ctx)); @@ -110,7 +110,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = (uint32_t)nonce; - ctx->input[15] = (uint32_t)(nonce >> 32); + ctx->input[15] = (uint32_t)(nonce >> 32); } static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) { @@ -184,7 +184,7 @@ static bool os_random_buf(void* buf, size_t buf_len) { arc4random_buf(buf, buf_len); return true; } -#elif defined(__linux__) +#elif defined(__linux__) #include #include #include @@ -241,8 +241,8 @@ static bool os_random_buf(void* buf, size_t buf_len) { #include #endif -static uintptr_t os_random_weak(uintptr_t extra_seed) { - uintptr_t x = (uintptr_t)&os_random_weak ^ extra_seed; // ASLR makes the address random +uintptr_t _os_random_weak(uintptr_t extra_seed) { + uintptr_t x = (uintptr_t)&_os_random_weak ^ extra_seed; // ASLR makes the address random #if defined(_WIN32) LARGE_INTEGER pcount; QueryPerformanceCounter(&pcount); @@ -267,10 +267,10 @@ static uintptr_t os_random_weak(uintptr_t extra_seed) { void _mi_random_init(mi_random_ctx_t* ctx) { uint8_t key[32]; if (!os_random_buf(key, sizeof(key))) { - // if we fail to get random data from the OS, we fall back to a + // if we fail to get random data from the OS, we fall back to a // weak random source based on the current time _mi_warning_message("unable to use secure randomness\n"); - uintptr_t x = os_random_weak(0); + uintptr_t x = _os_random_weak(0); for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words. 
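      // (each pass below mixes the previous word through _mi_random_shuffle,
      //  a small xorshift-multiply style mixer, so the eight key words differ
      //  even though they derive from a single weak seed)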
x = _mi_random_shuffle(x); ((uint32_t*)key)[i] = (uint32_t)x; @@ -280,7 +280,7 @@ void _mi_random_init(mi_random_ctx_t* ctx) { } /* -------------------------------------------------------- -test vectors from +test vectors from ----------------------------------------------------------- */ /* static bool array_equals(uint32_t* x, uint32_t* y, size_t n) { diff --git a/src/segment.c b/src/segment.c index c7a9662b..0e70c3bf 100644 --- a/src/segment.c +++ b/src/segment.c @@ -17,9 +17,9 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_ /* -------------------------------------------------------------------------------- Segment allocation - We allocate pages inside bigger "segments" (4mb on 64-bit). This is to avoid - splitting VMA's on Linux and reduce fragmentation on other OS's. - Each thread owns its own segments. + We allocate pages inside bigger "segments" (4mb on 64-bit). This is to avoid + splitting VMA's on Linux and reduce fragmentation on other OS's. + Each thread owns its own segments. Currently we have: - small pages (64kb), 64 in one segment @@ -154,14 +154,14 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* for (size_t i = 0; i < segment->capacity; i++) { const mi_page_t* const page = &segment->pages[i]; if (!page->segment_in_use) { - nfree++; + nfree++; } if (page->segment_in_use || page->is_reset) { mi_assert_expensive(!mi_pages_reset_contains(page, tld)); } } mi_assert_internal(nfree + segment->used == segment->capacity); - mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 + // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 mi_assert_internal(segment->page_kind == MI_PAGE_HUGE || (mi_segment_page_size(segment) * segment->capacity == segment->segment_size)); return true; @@ -286,7 +286,7 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen mi_assert_expensive(!mi_pages_reset_contains(page, tld)); mi_assert_internal(_mi_page_segment(page)==segment); if (!mi_option_is_enabled(mi_option_page_reset)) return; - if (segment->mem_is_fixed || page->segment_in_use || !page->is_committed || page->is_reset) return; + if (segment->mem_is_fixed || page->segment_in_use || !page->is_committed || page->is_reset) return; if (mi_option_get(mi_option_reset_delay) == 0) { // reset immediately? 
@@ -295,7 +295,7 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen else { // otherwise push on the delayed page reset queue mi_page_queue_t* pq = &tld->pages_reset; - // push on top + // push on top mi_page_reset_set_expire(page); page->next = pq->first; page->prev = NULL; @@ -316,7 +316,7 @@ static void mi_pages_reset_remove(mi_page_t* page, mi_segments_tld_t* tld) { mi_page_queue_t* pq = &tld->pages_reset; mi_assert_internal(pq!=NULL); mi_assert_internal(!page->segment_in_use); - mi_assert_internal(mi_pages_reset_contains(page, tld)); + mi_assert_internal(mi_pages_reset_contains(page, tld)); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == pq->last) pq->last = page->prev; @@ -332,19 +332,19 @@ static void mi_pages_reset_remove_all_in_segment(mi_segment_t* segment, bool for if (!page->segment_in_use && page->is_committed && !page->is_reset) { mi_pages_reset_remove(page, tld); if (force_reset) { - mi_page_reset(segment, page, 0, tld); + mi_page_reset(segment, page, 0, tld); } } else { mi_assert_internal(mi_page_not_in_queue(page,tld)); - } + } } } static void mi_reset_delayed(mi_segments_tld_t* tld) { if (!mi_option_is_enabled(mi_option_page_reset)) return; mi_msecs_t now = _mi_clock_now(); - mi_page_queue_t* pq = &tld->pages_reset; + mi_page_queue_t* pq = &tld->pages_reset; // from oldest up to the first that has not expired yet mi_page_t* page = pq->last; while (page != NULL && mi_page_reset_is_expired(page,now)) { @@ -358,7 +358,7 @@ static void mi_reset_delayed(mi_segments_tld_t* tld) { pq->last = page; if (page != NULL){ page->next = NULL; - } + } else { pq->first = NULL; } @@ -540,7 +540,7 @@ void _mi_segment_thread_collect(mi_segments_tld_t* tld) { } mi_assert_internal(tld->cache_count == 0); mi_assert_internal(tld->cache == NULL); -#if MI_DEBUG>=2 +#if MI_DEBUG>=2 if (!_mi_is_main_thread()) { mi_assert_internal(tld->pages_reset.first == NULL); mi_assert_internal(tld->pages_reset.last == NULL); @@ -684,7 +684,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { UNUSED(force); - mi_assert(segment != NULL); + mi_assert(segment != NULL); // note: don't reset pages even on abandon as the whole segment is freed? 
(and ready for reuse) bool force_reset = (force && mi_option_is_enabled(mi_option_abandoned_page_reset)); mi_pages_reset_remove_all_in_segment(segment, force_reset, tld); @@ -716,7 +716,7 @@ static bool mi_segment_has_free(const mi_segment_t* segment) { static void mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { mi_assert_internal(_mi_page_segment(page) == segment); - mi_assert_internal(!page->segment_in_use); + mi_assert_internal(!page->segment_in_use); // set in-use before doing unreset to prevent delayed reset mi_pages_reset_remove(page, tld); page->segment_in_use = true; @@ -756,7 +756,7 @@ static void mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_seg static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); // clear page data; can be called on abandoned segments -static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool allow_reset, mi_segments_tld_t* tld) +static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool allow_reset, mi_segments_tld_t* tld) { mi_assert_internal(page->segment_in_use); mi_assert_internal(mi_page_all_free(page)); @@ -787,7 +787,7 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a segment->used--; // add to the free page list for reuse/reset - if (allow_reset) { + if (allow_reset) { mi_pages_reset_add(segment, page, tld); } } @@ -841,12 +841,12 @@ Note: the current implementation is one possible design; another way might be to keep track of abandoned segments in the regions. This would have the advantage of keeping all concurrent code in one place and not needing to deal -with ABA issues. The drawback is that it is unclear how to -scan abandoned segments efficiently in that case as they +with ABA issues. The drawback is that it is unclear how to +scan abandoned segments efficiently in that case as they would be spread among all other segments in the regions. ----------------------------------------------------------- */ -// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers +// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers // to put in a tag that increments on update to avoid the A-B-A problem. #define MI_TAGGED_MASK MI_SEGMENT_MASK typedef uintptr_t mi_tagged_segment_t; @@ -862,7 +862,7 @@ static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_se } // This is a list of visited abandoned pages that were full at the time. -// this list migrates to `abandoned` when that becomes NULL. The use of +// this list migrates to `abandoned` when that becomes NULL. The use of // this list reduces contention and the rate at which segments are visited. static mi_decl_cache_align volatile _Atomic(mi_segment_t*) abandoned_visited; // = NULL @@ -888,7 +888,7 @@ static void mi_abandoned_visited_push(mi_segment_t* segment) { } // Move the visited list to the abandoned list. -static bool mi_abandoned_visited_revisit(void) +static bool mi_abandoned_visited_revisit(void) { // quick check if the visited list is empty if (mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned_visited)==NULL) return false; @@ -954,12 +954,12 @@ static mi_segment_t* mi_abandoned_pop(void) { segment = mi_tagged_segment_ptr(ts); if (mi_likely(segment == NULL)) { if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL - return NULL; + return NULL; } } // Do a pop. 
We use a reader count to prevent - // a segment to be decommitted while a read is still pending, + // a segment to be decommitted while a read is still pending, // and a tagged pointer to prevent A-B-A link corruption. // (this is called from `memory.c:_mi_mem_free` for example) mi_atomic_increment(&abandoned_readers); // ensure no segment gets decommitted @@ -1024,7 +1024,7 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { ----------------------------------------------------------- */ // Possibly clear pages and check if free space is available -static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free) +static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free) { mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); bool has_page = false; @@ -1032,17 +1032,17 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool size_t pages_used_empty = 0; for (size_t i = 0; i < segment->capacity; i++) { mi_page_t* page = &segment->pages[i]; - if (page->segment_in_use) { + if (page->segment_in_use) { pages_used++; // ensure used count is up to date and collect potential concurrent frees - _mi_page_free_collect(page, false); + _mi_page_free_collect(page, false); if (mi_page_all_free(page)) { // if everything free already, page can be reused for some block size // note: don't clear the page yet as we can only OS reset it once it is reclaimed pages_used_empty++; has_page = true; } - else if (page->xblock_size == block_size && mi_page_has_any_available(page)) { + else if (page->xblock_size == block_size && mi_page_has_any_available(page)) { // a page has available free blocks of the right size has_page = true; } @@ -1051,7 +1051,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool // whole empty page has_page = true; } - } + } mi_assert_internal(pages_used == segment->used && pages_used >= pages_used_empty); if (all_pages_free != NULL) { *all_pages_free = ((pages_used - pages_used_empty) == 0); @@ -1100,7 +1100,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; } } } - } + } else if (page->is_committed && !page->is_reset) { // not in-use, and not reset yet // note: do not reset as this includes pages that were not touched before // mi_pages_reset_add(segment, page, tld); @@ -1141,17 +1141,17 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, // free the segment (by forced reclaim) to make it available to other threads. // note1: we prefer to free a segment as that might lead to reclaiming another // segment that is still partially used. - // note2: we could in principle optimize this by skipping reclaim and directly + // note2: we could in principle optimize this by skipping reclaim and directly // freeing but that would violate some invariants temporarily) mi_segment_reclaim(segment, heap, 0, NULL, tld); } else if (has_page && segment->page_kind == page_kind) { - // found a free page of the right kind, or page of the right block_size with free space + // found a free page of the right kind, or page of the right block_size with free space // we return the result of reclaim (which is usually `segment`) as it might free // the segment due to concurrent frees (in which case `NULL` is returned). 
return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); } - else if (segment->abandoned_visits >= 3) { + else if (segment->abandoned_visits >= 3) { // always reclaim on 3rd visit to limit the list length. mi_segment_reclaim(segment, heap, 0, NULL, tld); } @@ -1165,12 +1165,12 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, /* ----------------------------------------------------------- - Reclaim or allocate + Reclaim or allocate ----------------------------------------------------------- */ -static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_assert_internal(page_kind <= MI_PAGE_LARGE); + mi_assert_internal(page_kind <= MI_PAGE_LARGE); mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); // 1. try to get a segment from our cache mi_segment_t* segment = mi_segment_cache_pop(MI_SEGMENT_SIZE, tld); @@ -1191,7 +1191,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_s return segment; } // 3. otherwise allocate a fresh segment - return mi_segment_alloc(0, page_kind, page_shift, tld, os_tld); + return mi_segment_alloc(0, page_kind, page_shift, tld, os_tld); } @@ -1216,11 +1216,11 @@ static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t* // Allocate a page inside a segment. Requires that the page has free pages static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(mi_segment_has_free(segment)); - return mi_segment_find_free(segment, tld); + return mi_segment_find_free(segment, tld); } static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - // find an available segment the segment free queue + // find an available segment the segment free queue mi_segment_queue_t* const free_queue = mi_segment_free_queue_of_kind(kind, tld); if (mi_segment_queue_is_empty(free_queue)) { // possibly allocate or reclaim a fresh segment @@ -1275,7 +1275,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld } /* ----------------------------------------------------------- - Page allocation + Page allocation ----------------------------------------------------------- */ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { diff --git a/test/test-stress.c b/test/test-stress.c index 1b559a59..8958933e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -20,7 +20,7 @@ terms of the MIT license. #include #include #include -#include +// #include // > mimalloc-test-stress [THREADS] [SCALE] [ITER] // @@ -38,7 +38,7 @@ static bool allow_large_objects = true; // allow very large objects? static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? 
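The core of the workload these knobs tune is a cross-thread handoff through a
shared transfer buffer; a reduced sketch (illustrative sizes, plain
malloc/free standing in for the custom_* macros below):

#include <stdatomic.h>
#include <stdlib.h>

#define TRANSFER_N 1000
static _Atomic(void*) transfer[TRANSFER_N];

// run concurrently by many threads: each publishes a fresh block and frees
// whatever was parked in the slot, so frees frequently land on blocks that
// another thread (and thus another heap) allocated
static void stress_round(void) {
  for (int i = 0; i < 100; i++) {
    void* fresh = malloc((size_t)(rand() % 1000) + 1);
    int slot = rand() % TRANSFER_N;
    void* old = atomic_exchange(&transfer[slot], fresh);
    free(old);   // free(NULL) is harmless on the first pass
  }
}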
-#ifdef USE_STD_MALLOC +#ifndef USE_STD_MALLOC #define custom_calloc(n,s) calloc(n,s) #define custom_realloc(p,s) realloc(p,s) #define custom_free(p) free(p) @@ -188,7 +188,7 @@ static void test_stress(void) { free_items(p); } } - mi_collect(false); + // mi_collect(false); #ifndef NDEBUG if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif @@ -206,7 +206,7 @@ static void leak(intptr_t tid) { } } -static void test_leak(void) { +static void test_leak(void) { for (int n = 0; n < ITER; n++) { run_os_threads(THREADS, &leak); mi_collect(false); @@ -242,15 +242,15 @@ int main(int argc, char** argv) { // Run ITER full iterations where half the objects in the transfer buffer survive to the next round. srand(0x7feb352d); - mi_stats_reset(); + // mi_stats_reset(); #ifdef STRESS test_stress(); #else test_leak(); -#endif +#endif - mi_collect(true); - mi_stats_print(NULL); + // mi_collect(true); + // mi_stats_print(NULL); //bench_end_program(); return 0; } @@ -262,7 +262,7 @@ static void (*thread_entry_fun)(intptr_t) = &stress; #include -static DWORD WINAPI thread_entry(LPVOID param) { +static DWORD WINAPI thread_entry(LPVOID param) { thread_entry_fun((intptr_t)param); return 0; } From ed1c8a203ab0ce9df97919767d01bc3f180ec2f1 Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 29 Jan 2020 23:08:12 -0800 Subject: [PATCH 02/17] improve performance with tls recursion counter --- include/mimalloc-internal.h | 19 +++++++++++-------- src/init.c | 23 +++++++++++++++-------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index f4b578f6..b2e57aec 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -275,24 +275,27 @@ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value o extern mi_heap_t _mi_heap_main; // statically allocated main backing heap extern bool _mi_process_is_initialized; +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from #ifdef MI_TLS_RECURSE_GUARD extern mi_heap_t* _mi_get_default_heap_tls_safe(void); +extern size_t _mi_tls_recurse; +#endif + static inline mi_heap_t* mi_get_default_heap(void) { + #ifdef MI_TLS_RECURSE_GUARD + if (_mi_tls_recurse++>100) { // on some BSD platforms, like macOS, the dynamic loader calls `malloc` // to initialize thread local data. To avoid recursion, we need to avoid // accessing the thread local `_mi_default_heap` until our module is loaded // and use the statically allocated main heap until that time. // TODO: patch ourselves dynamically to avoid this check every time? 
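 // (counter heuristic: `_mi_tls_recurse` is initialized high, so the very
 //  first lookup takes the TLS-safe path below and resets the counter; the
 //  next ~100 lookups then read the plain __thread variable directly, with
 //  every 100th one re-validating through the safe path again)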
- return _mi_get_default_heap_tls_safe(); -#else - -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from - -static inline mi_heap_t* mi_get_default_heap(void) { + mi_heap_t* heap = _mi_get_default_heap_tls_safe(); + _mi_tls_recurse = 0; + return heap; + } + #endif return _mi_heap_default; - -#endif } static inline bool mi_heap_is_default(const mi_heap_t* heap) { diff --git a/src/init.c b/src/init.c index 922b7438..750be169 100644 --- a/src/init.c +++ b/src/init.c @@ -266,9 +266,8 @@ static void _mi_thread_done(mi_heap_t* default_heap); #endif // Set up handlers so `mi_thread_done` is called automatically -static bool tls_initialized = false; // fine if it races - static void mi_process_setup_auto_thread_done(void) { + static bool tls_initialized = false; // fine if it races if (tls_initialized) return; tls_initialized = true; #if defined(_WIN32) && defined(MI_SHARED_LIB) @@ -319,9 +318,6 @@ static void _mi_thread_done(mi_heap_t* heap) { void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(heap != NULL); - #ifndef MI_TLS_RECURSE_GUARD - _mi_heap_default = heap; - #endif // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. @@ -332,8 +328,18 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { #elif defined(MI_USE_PTHREADS) pthread_setspecific(mi_pthread_key, heap); #endif + if (_mi_tls_recurse < 100) { + _mi_heap_default = heap; + } } +#ifdef MI_TLS_RECURSE_GUARD +// initialize high so the first call uses safe TLS +size_t _mi_tls_recurse = 10000; +#else +size_t _mi_tls_recurse = 0; +#endif + mi_heap_t* _mi_get_default_heap_tls_safe(void) { if (mi_unlikely(mi_pthread_key==0)) return (mi_heap_t*)&_mi_heap_empty; mi_heap_t* heap = pthread_getspecific(mi_pthread_key); @@ -347,7 +353,6 @@ static void mi_process_done(void); static bool os_preloading = true; // true until this module is initialized static bool mi_redirected = false; // true if malloc redirects to mi_malloc -bool _mi_tls_initialized = false; // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. bool _mi_preloading() { @@ -395,7 +400,7 @@ static void mi_process_load(void) { volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; UNUSED(dummy); os_preloading = false; - _mi_tls_initialized = true; + _mi_heap_set_default_direct(&_mi_heap_main); atexit(&mi_process_done); _mi_options_init(); mi_process_init(); @@ -414,7 +419,9 @@ void _mi_heap_main_init(void) { if (_mi_heap_main.cookie == 0) { _mi_heap_main.thread_id = _mi_thread_id(); _mi_heap_main.cookie = _os_random_weak((uintptr_t)&_mi_heap_main_init); - _mi_random_init(&_mi_heap_main.random); + } + if (_mi_tls_recurse < 100) { + _mi_random_init(&_mi_heap_main.random); _mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main); } From fea903900d7f40c1c9af4f9059dc2fbfaa6a187c Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 1 Feb 2020 14:33:24 -0800 Subject: [PATCH 03/17] use __thread locals on linux --- include/mimalloc-internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 872c5269..7173a189 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -11,7 +11,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc-types.h" #if defined(MI_MALLOC_OVERRIDE) -#if defined(__APPLE__) || defined(__linux__) +#if defined(__APPLE__) #include #define MI_TLS_PTHREADS #elif (defined(__OpenBSD__) || defined(__DragonFly__)) From 0989562c2d87aa77f33e590357501fc9d2d485bc Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 1 Feb 2020 16:57:00 -0800 Subject: [PATCH 04/17] add initial fast tls for macOSX --- include/mimalloc-internal.h | 32 ++++++++++++++++++++++++++------ src/init.c | 19 +++++++++++++------ test/test-stress.c | 4 ++-- 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 7173a189..0e3ebed8 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -11,7 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc-types.h" #if defined(MI_MALLOC_OVERRIDE) -#if defined(__APPLE__) +#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__)) +#define MI_TLS_OSX_FAST +#define MI_TLS_OSX_SLOT 94 // seems unused, except in Webkit? See: +#elif defined(__APPLE__) #include #define MI_TLS_PTHREADS #elif (defined(__OpenBSD__) || defined(__DragonFly__)) @@ -284,14 +287,31 @@ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value o extern mi_heap_t _mi_heap_main; // statically allocated main backing heap extern bool _mi_process_is_initialized; -#if defined(MI_TLS_PTHREADS) +#if defined(MI_TLS_OSX_FAST) +#define MI_TLS_OSX_OFFSET (MI_TLS_OSX_SLOT*sizeof(void*)) +static inline void* mi_tls_osx_fast_get(void) { + void* ret; + __asm__("mov %%gs:%1, %0" : "=r" (ret) : "m" (*(void**)(MI_TLS_OSX_OFFSET))); + return ret; +} +static inline void mi_tls_osx_fast_set(void* value) { + __asm__("movq %1,%%gs:%0" : "=m" (*(void**)(MI_TLS_OSX_OFFSET)) : "rn" (value)); +} +#elif defined(MI_TLS_PTHREADS) extern pthread_key_t _mi_heap_default_key; #else extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from #endif static inline mi_heap_t* mi_get_default_heap(void) { -#if defined(MI_TLS_PTHREADS) +#if defined(MI_TLS_OSX_FAST) + // Use a fixed slot in the TSD on MacOSX to avoid recursion (since the loader calls malloc). + // We use slot 94 (__PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4) + // which seems unused except for the more recent Webkit + // Use with care. + mi_heap_t* heap = (mi_heap_t*)mi_tls_osx_fast_get(); + return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); +#elif defined(MI_TLS_PTHREADS) // Use pthreads for TLS; this is used on macOSX with interpose as the loader calls `malloc` // to allocate TLS storage leading to recursive calls if __thread declared variables are accessed. // Using pthreads allows us to initialize without recursive calls. (performance seems still quite good). @@ -300,9 +320,9 @@ static inline mi_heap_t* mi_get_default_heap(void) { #else #if defined(MI_TLS_RECURSE_GUARD) // On some BSD platforms, like openBSD, the dynamic loader calls `malloc` - // to initialize thread local data. To avoid recursion, we need to avoid - // accessing the thread local `_mi_default_heap` until our module is loaded - // and use the statically allocated main heap until that time. + // to initialize thread local data (before our module is loaded). + // To avoid recursion, we need to avoid accessing the thread local `_mi_default_heap` + // until our module is loaded and use the statically allocated main heap until that time. 
// TODO: patch ourselves dynamically to avoid this check every time? if (mi_unlikely(!_mi_process_is_initialized)) return &_mi_heap_main; #endif diff --git a/src/init.c b/src/init.c index 431b7fee..960cccf1 100644 --- a/src/init.c +++ b/src/init.c @@ -260,14 +260,15 @@ static void _mi_thread_done(mi_heap_t* default_heap); // use thread local storage keys to detect thread ending #include #include - static DWORD mi_fls_key; + static DWORD mi_fls_key = (DWORD)(-1); static void NTAPI mi_fls_done(PVOID value) { if (value!=NULL) _mi_thread_done((mi_heap_t*)value); } #elif defined(MI_USE_PTHREADS) - // use pthread locol storage keys to detect thread ending + // use pthread local storage keys to detect thread ending + // (and used with MI_TLS_PTHREADS for the default heap) #include - pthread_key_t _mi_heap_default_key; + pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); static void mi_pthread_done(void* value) { if (value!=NULL) _mi_thread_done((mi_heap_t*)value); } @@ -287,6 +288,7 @@ static void mi_process_setup_auto_thread_done(void) { #elif defined(_WIN32) && !defined(MI_SHARED_LIB) mi_fls_key = FlsAlloc(&mi_fls_done); #elif defined(MI_USE_PTHREADS) + mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); #endif _mi_heap_set_default_direct(&_mi_heap_main); @@ -331,9 +333,14 @@ static void _mi_thread_done(mi_heap_t* heap) { void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(heap != NULL); - #if !defined(MI_TLS_PTHREADS) + #if defined(MI_TLS_OSX_FAST) + mi_tls_osx_fast_set(heap); + #elif defined(MI_TLS_PTHREADS) + // we use _mi_heap_default_key + #else _mi_heap_default = heap; - #endif + #endif + // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. #if defined(_WIN32) && defined(MI_SHARED_LIB) @@ -342,7 +349,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(mi_fls_key != 0); FlsSetValue(mi_fls_key, heap); #elif defined(MI_USE_PTHREADS) - // mi_assert_internal(_mi_heap_default_key != 0); // often 0 is also the allocated key + mi_assert_internal(_mi_heap_default_key != (pthread_key_t)(-1)); pthread_setspecific(_mi_heap_default_key, heap); #endif } diff --git a/test/test-stress.c b/test/test-stress.c index 1bfc5012..7d8993a0 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -27,7 +27,7 @@ terms of the MIT license. 
// argument defaults static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 10; // scaling factor -static int ITER = 5; // N full iterations destructing and re-creating all threads +static int ITER = 50; // N full iterations destructing and re-creating all threads // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int SCALE = 100; // scaling factor @@ -250,7 +250,7 @@ int main(int argc, char** argv) { #endif // mi_collect(true); - // mi_stats_print(NULL); + mi_stats_print(NULL); //bench_end_program(); return 0; } From d2db9f1fc26e9545bcacfb35376ccda473adf803 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 13:12:22 -0800 Subject: [PATCH 05/17] update thread local storage --- include/mimalloc-internal.h | 133 +++++++++++++++++++++--------------- src/init.c | 20 ++++-- 2 files changed, 95 insertions(+), 58 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 0e3ebed8..0669048e 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -10,18 +10,6 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc-types.h" -#if defined(MI_MALLOC_OVERRIDE) -#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__)) -#define MI_TLS_OSX_FAST -#define MI_TLS_OSX_SLOT 94 // seems unused, except in Webkit? See: -#elif defined(__APPLE__) -#include -#define MI_TLS_PTHREADS -#elif (defined(__OpenBSD__) || defined(__DragonFly__)) -#define MI_TLS_RECURSE_GUARD -#endif -#endif - #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) #else @@ -284,47 +272,53 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot ----------------------------------------------------------- */ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap -extern mi_heap_t _mi_heap_main; // statically allocated main backing heap extern bool _mi_process_is_initialized; +mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap -#if defined(MI_TLS_OSX_FAST) -#define MI_TLS_OSX_OFFSET (MI_TLS_OSX_SLOT*sizeof(void*)) -static inline void* mi_tls_osx_fast_get(void) { - void* ret; - __asm__("mov %%gs:%1, %0" : "=r" (ret) : "m" (*(void**)(MI_TLS_OSX_OFFSET))); - return ret; +#if defined(MI_MALLOC_OVERRIDE) +// On some systems, MacOSX, OpenBSD, and DragonFly, accessing a thread local variable leads to recursion +// as the access invokes malloc. We avoid this by stealing a TLS slot from the OS internal slots so no +// allocation is involved. On OSX we use the direct TLS slots, while on the BSD's we use space in the `pthread_t` structure. +#if defined(__MACH__) // OSX +#define MI_TLS_SLOT 89 // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see + // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) +#elif defined(__OpenBSD__) +#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 1*sizeof(void*)) // offset `retval` +#elif defined(__DragonFly__) +#define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) 
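+// (both offsets point into the thread library's own `pthread_t` record, at a
+//  field that exists from thread creation but is not used during early
+//  loading, so a heap pointer can be parked there before any TLS machinery
+//  or allocation is available)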
+#endif +#endif + +#if defined(MI_TLS_SLOT) +static inline void* mi_tls_slot(size_t slot); // forward declaration +#elif defined(MI_TLS_PTHREAD_SLOT_OFS) +static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { + pthread_t self = pthread_self(); + return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); } -static inline void mi_tls_osx_fast_set(void* value) { - __asm__("movq %1,%%gs:%0" : "=m" (*(void**)(MI_TLS_OSX_OFFSET)) : "rn" (value)); -} -#elif defined(MI_TLS_PTHREADS) -extern pthread_key_t _mi_heap_default_key; +#elif defined(MI_TLS_PTHREAD) +extern pthread_key_t _mi_heap_default_key; #else extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from #endif static inline mi_heap_t* mi_get_default_heap(void) { -#if defined(MI_TLS_OSX_FAST) - // Use a fixed slot in the TSD on MacOSX to avoid recursion (since the loader calls malloc). - // We use slot 94 (__PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4) - // which seems unused except for the more recent Webkit - // Use with care. - mi_heap_t* heap = (mi_heap_t*)mi_tls_osx_fast_get(); +#if defined(MI_TLS_SLOT) + // Use steal a fixed slot in the TLS on MacOSX to avoid recursion (since the loader calls malloc). + mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); -#elif defined(MI_TLS_PTHREADS) - // Use pthreads for TLS; this is used on macOSX with interpose as the loader calls `malloc` - // to allocate TLS storage leading to recursive calls if __thread declared variables are accessed. - // Using pthreads allows us to initialize without recursive calls. (performance seems still quite good). - mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? (mi_heap_t*)&_mi_heap_empty : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); +#elif defined(MI_TLS_PTHREAD_SLOT_OFS) + mi_heap_t* heap = mi_tls_pthread_heap_slot(); + return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); +#elif defined(MI_TLS_PTHREAD) + mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); #else #if defined(MI_TLS_RECURSE_GUARD) - // On some BSD platforms, like openBSD, the dynamic loader calls `malloc` - // to initialize thread local data (before our module is loaded). // To avoid recursion, we need to avoid accessing the thread local `_mi_default_heap` // until our module is loaded and use the statically allocated main heap until that time. // TODO: patch ourselves dynamically to avoid this check every time? 
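 // (taken together: a raw OS TLS slot on macOS, a pthread-struct field on the
 //  BSDs, pthread_getspecific as the general MI_TLS_PTHREAD fallback, and the
 //  plain __thread variable as the fast path everywhere else)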
-  if (mi_unlikely(!_mi_process_is_initialized)) return &_mi_heap_main;
+  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
 #endif
   return _mi_heap_default;
 #endif
@@ -344,6 +338,7 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
 }
 
 static inline uintptr_t _mi_ptr_cookie(const void* p) {
+  extern mi_heap_t _mi_heap_main;
   mi_assert_internal(_mi_heap_main.cookie != 0);
   return ((uintptr_t)p ^ _mi_heap_main.cookie);
 }
@@ -669,24 +664,54 @@ static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept {
   // Windows: works on Intel and ARM in both 32- and 64-bit
   return (uintptr_t)NtCurrentTeb();
 }
-#elif (defined(__GNUC__) || defined(__clang__)) && \
+
+#elif defined(__GNUC__) && \
       (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))
-// TLS register on x86 is in the FS or GS register
-// see: https://akkadia.org/drepper/tls.pdf
+
+// TLS register on x86 is in the FS or GS register, see: https://akkadia.org/drepper/tls.pdf
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
+  void* res;
+  const size_t ofs = (slot*sizeof(void*));
+#if defined(__i386__)
+  __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // 32-bit always uses GS
+#elif defined(__MACH__) && defined(__x86_64__)
+  __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
+#elif defined(__x86_64__)
+  __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
+#elif defined(__arm__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+  res = tcb[slot];
+#elif defined(__aarch64__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+  res = tcb[slot];
+#endif
+  return res;
+}
+
+static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+  const size_t ofs = (slot*sizeof(void*));
+#if defined(__i386__)
+  __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
+#elif defined(__MACH__) && defined(__x86_64__)
+  __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOSX uses GS
+#elif defined(__x86_64__)
+  __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
+#elif defined(__arm__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+  tcb[slot] = value;
+#elif defined(__aarch64__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+  tcb[slot] = value;
+#endif
+}
+
 static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept {
-  uintptr_t tid;
-  #if defined(__i386__)
-  __asm__("movl %%gs:0, %0" : "=r" (tid) : : );  // 32-bit always uses GS
-  #elif defined(__MACH__)
-  __asm__("movq %%gs:0, %0" : "=r" (tid) : : );  // x86_64 macOS uses GS
-  #elif defined(__x86_64__)
-  __asm__("movq %%fs:0, %0" : "=r" (tid) : : );  // x86_64 Linux, BSD uses FS
-  #elif defined(__arm__)
-  asm volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid));
-  #elif defined(__aarch64__)
-  asm volatile ("mrs %0, tpidr_el0" : "=r" (tid));
-  #endif
-  return tid;
+  // normally, slot 0 is the pointer to the thread control block
+  return (uintptr_t)mi_tls_slot(0);
 }
 #else
 // otherwise use standard C
diff --git a/src/init.c b/src/init.c
index 960cccf1..f59daa9e 100644
--- a/src/init.c
+++ b/src/init.c
@@ -107,6 +107,8 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
 
 #define tld_main_stats  ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats)))
 #define tld_main_os     ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os)))
 
+extern mi_heap_t _mi_heap_main;
+
 static mi_tld_t tld_main = {
   0, false,
   &_mi_heap_main,
@@ -146,6 +148,11 @@ static void mi_heap_main_init(void) {
   }
 }
 
+mi_heap_t* _mi_heap_main_get(void) {
+  mi_heap_main_init();
+  return &_mi_heap_main;
+}
+
 
 /* -----------------------------------------------------------
   Initialization and freeing of the thread local heaps
@@ -333,9 +340,11 @@ static void _mi_thread_done(mi_heap_t* heap) {
 
 void _mi_heap_set_default_direct(mi_heap_t* heap)  {
   mi_assert_internal(heap != NULL);
-  #if defined(MI_TLS_OSX_FAST)
-  mi_tls_osx_fast_set(heap);
-  #elif defined(MI_TLS_PTHREADS)
+  #if defined(MI_TLS_SLOT)
+  mi_tls_slot_set(MI_TLS_SLOT,heap);
+  #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+  *mi_tls_pthread_heap_slot() = heap;
+  #elif defined(MI_TLS_PTHREAD)
   // we use _mi_heap_default_key
   #else
   _mi_heap_default = heap;
@@ -406,13 +415,16 @@ static void mi_allocator_done() {
 // Called once by the process loader
 static void mi_process_load(void) {
+  mi_heap_main_init();
+  #if defined(MI_TLS_RECURSE_GUARD)
   volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
   UNUSED(dummy);
+  #endif
   os_preloading = false;
   atexit(&mi_process_done);
   _mi_options_init();
   mi_process_init();
   //mi_stats_reset();
   if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");  // show message from the redirector (if present)

From 8bc20631e47b8c0ec79efb5f2452e958bffb4558 Mon Sep 17 00:00:00 2001
From: daan
Date: Sun, 2 Feb 2020 13:25:26 -0800
Subject: [PATCH 06/17] fixes for freeBSD

---
 include/mimalloc-internal.h | 4 +++-
 src/init.c                  | 5 +++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index 0669048e..cfbdc9ca 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -292,11 +292,13 @@ mi_heap_t* _mi_heap_main_get(void);  // statically allocated main backing hea
 #if defined(MI_TLS_SLOT)
 static inline void* mi_tls_slot(size_t slot);   // forward declaration
 #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+#include <pthread.h>
 static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
   pthread_t self = pthread_self();
   return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
 }
 #elif defined(MI_TLS_PTHREAD)
+#include <pthread.h>
 extern pthread_key_t _mi_heap_default_key;
 #else
 extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
@@ -308,7 +310,7 @@ static inline mi_heap_t* mi_get_default_heap(void) {
   mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
   return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
 #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
-  mi_heap_t* heap = mi_tls_pthread_heap_slot();
+  mi_heap_t* heap = *mi_tls_pthread_heap_slot();
   return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
 #elif defined(MI_TLS_PTHREAD)
   mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
diff --git a/src/init.c b/src/init.c
index f59daa9e..b7f329cb 100644
--- a/src/init.c
+++ b/src/init.c
@@ -168,7 +168,7 @@ typedef struct mi_thread_data_s {
 static bool _mi_heap_init(void) {
   if (mi_heap_is_initialized(mi_get_default_heap())) return true;
   if (_mi_is_main_thread()) {
-    mi_assert_internal(_mi_heap_main.thread_id != 0);
+    // mi_assert_internal(_mi_heap_main.thread_id != 0);  // can happen on freeBSD where alloc is called before any initialization
     // the main heap is statically allocated
     mi_heap_main_init();
     _mi_heap_set_default_direct(&_mi_heap_main);
@@ -358,8 +358,9 @@ void _mi_heap_set_default_direct(mi_heap_t* heap)  {
   mi_assert_internal(mi_fls_key != 0);
   FlsSetValue(mi_fls_key, heap);
 #elif defined(MI_USE_PTHREADS)
-  mi_assert_internal(_mi_heap_default_key != (pthread_key_t)(-1));
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // can happen during recursive invocation on freeBSD
   pthread_setspecific(_mi_heap_default_key, heap);
+  }
 #endif
 }
 

From 07fbe4f80f04a417bb19ac83113f73e1d1db3393 Mon Sep 17 00:00:00 2001
From: daan
Date: Sun, 2 Feb 2020 14:31:28 -0800
Subject: [PATCH 07/17] fixes for dragonfly

---
 include/mimalloc-internal.h | 7 +++++++
 src/options.c               | 4 ++++
 2 files changed, 11 insertions(+)

diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index cfbdc9ca..b11cb5fe 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -285,6 +285,7 @@ mi_heap_t* _mi_heap_main_get(void);  // statically allocated main backing hea
 #elif defined(__OpenBSD__)
 #define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 1*sizeof(void*)) // offset `retval`
 #elif defined(__DragonFly__)
+#warning "mimalloc is not working correctly on DragonFly yet."
 #define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?)
 #endif
 #endif
@@ -295,6 +296,12 @@ static inline void* mi_tls_slot(size_t slot);   // forward declaration
 #include <pthread.h>
 static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
   pthread_t self = pthread_self();
+  #if defined(__DragonFly__)
+  if (self==NULL) {
+    static mi_heap_t* pheap_main = _mi_heap_main_get();
+    return &pheap_main;
+  }
+  #endif
   return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
 }
 #elif defined(MI_TLS_PTHREAD)
diff --git a/src/options.c b/src/options.c
index ec58c31c..0af4a485 100644
--- a/src/options.c
+++ b/src/options.c
@@ -70,7 +70,11 @@ static mi_option_desc_t options[_mi_option_last] =
   { 1, UNINIT, MI_OPTION(page_reset) },          // reset page memory on free
   { 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates
   { 0, UNINIT, MI_OPTION(segment_reset) },       // reset segment memory on free (needs eager commit)
+#if defined(__NetBSD__)
+  { 0, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
+#else
   { 1, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
+#endif
   { 100, UNINIT, MI_OPTION(reset_delay) },       // reset delay in milli-seconds
   { 0,   UNINIT, MI_OPTION(use_numa_nodes) },    // 0 = use available numa nodes, otherwise use at most N nodes.
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From 12c4108abe44ac5e084e9d12ee4dba8c7718ba24 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 16:09:09 -0800 Subject: [PATCH 08/17] update comments --- include/mimalloc-internal.h | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index b11cb5fe..75aea2e2 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -267,18 +267,25 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot } -/* ----------------------------------------------------------- - The thread local default heap ------------------------------------------------------------ */ +/* ---------------------------------------------------------------------------------------- +The thread local default heap: `_mi_get_default_heap` return the thread local heap. +On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a +__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures +that the storage will always be available (allocated on the thread stacks). +On some platforms though we cannot use that when overriding `malloc` since the underlying +TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. +We try to circumvent this in an efficient way: +- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the + loader itself calls `malloc` even before the modules are initialized. +- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS). +- DragonFly: not yet working. +------------------------------------------------------------------------------------------- */ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap extern bool _mi_process_is_initialized; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap #if defined(MI_MALLOC_OVERRIDE) -// On some systems, MacOSX, OpenBSD, and DragonFly, accessing a thread local variable leads to recursion -// as the access invokes malloc. We avoid this by stealing a TLS slot from the OS internal slots so no -// allocation is involved. On OSX we use the direct TLS slots, while on the BSD's we use space in the `pthread_t` structure. #if defined(__MACH__) // OSX #define MI_TLS_SLOT 89 // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) @@ -313,7 +320,6 @@ extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate static inline mi_heap_t* mi_get_default_heap(void) { #if defined(MI_TLS_SLOT) - // Use steal a fixed slot in the TLS on MacOSX to avoid recursion (since the loader calls malloc). mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); #elif defined(MI_TLS_PTHREAD_SLOT_OFS) @@ -323,10 +329,7 @@ static inline mi_heap_t* mi_get_default_heap(void) { mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); return (mi_unlikely(heap == NULL) ? 
(mi_heap_t*)&_mi_heap_empty : heap); #else - #if defined(MI_TLS_RECURSE_GUARD) - // To avoid recursion, we need to avoid accessing the thread local `_mi_default_heap` - // until our module is loaded and use the statically allocated main heap until that time. - // TODO: patch ourselves dynamically to avoid this check every time? + #if defined(MI_TLS_RECURSE_GUARD) if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); #endif return _mi_heap_default; @@ -662,9 +665,8 @@ static inline size_t _mi_os_numa_node_count(void) { // ------------------------------------------------------------------- -// Getting the thread id should be performant -// as it is called in the fast path of `_mi_free`, -// so we specialize for various platforms. +// Getting the thread id should be performant as it is called in the +// fast path of `_mi_free` and we specialize for various platforms. // ------------------------------------------------------------------- #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -699,6 +701,7 @@ static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept { return res; } +// setting is only used on macOSX for now static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { const size_t ofs = (slot*sizeof(void*)); #if defined(__i386__) @@ -719,7 +722,7 @@ static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { } static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { - // normally, slot 0 is the pointer to the thread control block + // in all our targets, slot 0 is the pointer to the thread control block return (uintptr_t)mi_tls_slot(0); } #else From 757dcc84115eeccb93ff23e177851c6d0d88f8ea Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 19:07:26 -0800 Subject: [PATCH 09/17] extend interpose for macOSX --- include/mimalloc-internal.h | 24 ++++++++++++------------ src/alloc-override.c | 17 +++++++++-------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 75aea2e2..37722cd9 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -269,11 +269,11 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot /* ---------------------------------------------------------------------------------------- The thread local default heap: `_mi_get_default_heap` return the thread local heap. -On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a +On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a __thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures -that the storage will always be available (allocated on the thread stacks). -On some platforms though we cannot use that when overriding `malloc` since the underlying -TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. +that the storage will always be available (allocated on the thread stacks). +On some platforms though we cannot use that when overriding `malloc` since the underlying +TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. We try to circumvent this in an efficient way: - macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the loader itself calls `malloc` even before the modules are initialized. 
@@ -285,11 +285,11 @@ extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value o
 extern bool _mi_process_is_initialized;
 mi_heap_t* _mi_heap_main_get(void);     // statically allocated main backing heap
 
-#if defined(MI_MALLOC_OVERRIDE) 
+#if defined(MI_MALLOC_OVERRIDE)
 #if defined(__MACH__) // OSX
-#define MI_TLS_SLOT               89  // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see 
+#define MI_TLS_SLOT               84  // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see 
                                       // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
-#elif defined(__OpenBSD__) 
+#elif defined(__OpenBSD__)
 #define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 1*sizeof(void*)) // offset `retval`
 #elif defined(__DragonFly__)
 #warning "mimalloc is not working correctly on DragonFly yet."
@@ -299,7 +299,7 @@ mi_heap_t* _mi_heap_main_get(void);  // statically allocated main backing hea
 
 #if defined(MI_TLS_SLOT)
 static inline void* mi_tls_slot(size_t slot);   // forward declaration
-#elif defined(MI_TLS_PTHREAD_SLOT_OFS) 
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
 #include <pthread.h>
 static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
   pthread_t self = pthread_self();
@@ -308,7 +308,7 @@ static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
     static mi_heap_t* pheap_main = _mi_heap_main_get();
     return &pheap_main;
   }
-  #endif 
+  #endif
   return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
 }
 #elif defined(MI_TLS_PTHREAD)
@@ -319,7 +319,7 @@ extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate
 #endif
 
 static inline mi_heap_t* mi_get_default_heap(void) {
-#if defined(MI_TLS_SLOT) 
+#if defined(MI_TLS_SLOT)
   mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
   return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
 #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
@@ -329,7 +329,7 @@ static inline mi_heap_t* mi_get_default_heap(void) {
   mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
   return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
 #else
-  #if defined(MI_TLS_RECURSE_GUARD) 
+  #if defined(MI_TLS_RECURSE_GUARD)
   if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
   #endif
   return _mi_heap_default;
@@ -665,7 +665,7 @@ static inline size_t _mi_os_numa_node_count(void) {
 
 // -------------------------------------------------------------------
-// Getting the thread id should be performant as it is called in the 
+// Getting the thread id should be performant as it is called in the
 // fast path of `_mi_free` and we specialize for various platforms.
 // -------------------------------------------------------------------
 #if defined(_WIN32)
diff --git a/src/alloc-override.c b/src/alloc-override.c
index 58996c5f..c0fdf161 100644
--- a/src/alloc-override.c
+++ b/src/alloc-override.c
@@ -41,26 +41,27 @@ terms of the MIT license. 
A copy of the license can be found in the file #endif #if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE) - static void mi_free_tls_safe(void* p) { - if (mi_unlikely(_mi_preloading())) return; - mi_free(p); - } // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1` // See: struct mi_interpose_s { const void* replacement; const void* target; }; - #define MI_INTERPOSEX(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } - #define MI_INTERPOSE_MI(fun) MI_INTERPOSEX(fun,mi_##fun) + #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } + #define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) = { MI_INTERPOSE_MI(malloc), MI_INTERPOSE_MI(calloc), MI_INTERPOSE_MI(realloc), - MI_INTERPOSEX(free,mi_free_tls_safe), MI_INTERPOSE_MI(strdup), - MI_INTERPOSE_MI(strndup) + MI_INTERPOSE_MI(strndup), + MI_INTERPOSE_MI(realpath), + MI_INTERPOSE_MI(posix_memalign), + MI_INTERPOSE_MI(reallocf), + MI_INTERPOSE_MI(valloc), + // some code allocates from a zone but deallocates using plain free :-( (like NxHashResizeToCapacity ) + MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us }; #elif defined(_MSC_VER) // cannot override malloc unless using a dll. From f3c47c7c91801c712db08d6944503132defef039 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 21:03:09 -0800 Subject: [PATCH 10/17] improved malloc zone handling on macOSX (not working yet) --- include/mimalloc-internal.h | 2 +- src/alloc-override-osx.c | 24 +++++++++++++++++++++++- src/alloc-override.c | 14 +++++++------- src/alloc.c | 16 ++++++++-------- src/init.c | 8 ++++---- test/test-stress.c | 2 +- 6 files changed, 44 insertions(+), 22 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 37722cd9..4ac7da78 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -298,7 +298,7 @@ mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing hea #endif #if defined(MI_TLS_SLOT) -static inline void* mi_tls_slot(size_t slot); // forward declaration +static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept; // forward declaration #elif defined(MI_TLS_PTHREAD_SLOT_OFS) #include static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { diff --git a/src/alloc-override-osx.c b/src/alloc-override-osx.c index d4f8b06d..ed0bc2de 100644 --- a/src/alloc-override-osx.c +++ b/src/alloc-override-osx.c @@ -14,6 +14,7 @@ terms of the MIT license. A copy of the license can be found in the file #error "this file should only be included on macOS" #endif +#warning "malloc zones do not seem to work for now; use MI_INTERPOSE instead" /* ------------------------------------------------------ Override system malloc on macOS This is done through the malloc zone interface. 
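
The interpose table extended in PATCH 09 above relies on dyld's `__DATA,__interpose` mechanism: dyld scans that section in every library loaded through `DYLD_INSERT_LIBRARIES` and rebinds each `target` symbol to its `replacement` in all other images, while the interposing library itself keeps its original bindings (so it can still reach the real functions). A minimal self-contained sketch of just this mechanism -- not mimalloc code; the names `my_malloc` and `libdemo.dylib` are made up for the illustration:

  // interpose-demo.c -- sketch of the dyld interpose mechanism (macOS only).
  // Build:  clang -dynamiclib interpose-demo.c -o libdemo.dylib
  // Run:    DYLD_INSERT_LIBRARIES=./libdemo.dylib ls
  #include <stdlib.h>
  #include <unistd.h>

  static void* my_malloc(size_t size) {
    void* p = malloc(size);               // not rebound inside this image: calls the real malloc
    write(STDERR_FILENO, "malloc!\n", 8); // write(), not printf(), to avoid re-entering the allocator
    return p;
  }

  struct interpose_s { const void* replacement; const void* target; };

  __attribute__((used)) static struct interpose_s _interposers[]
    __attribute__((section("__DATA,__interpose"))) = {
    { (const void*)&my_malloc, (const void*)&malloc }
  };
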
@@ -35,34 +36,42 @@ extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_im ------------------------------------------------------ */ static size_t zone_size(malloc_zone_t* zone, const void* p) { + UNUSED(zone); UNUSED(p); return 0; // as we cannot guarantee that `p` comes from us, just return 0 } static void* zone_malloc(malloc_zone_t* zone, size_t size) { + UNUSED(zone); return mi_malloc(size); } static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) { + UNUSED(zone); return mi_calloc(count, size); } static void* zone_valloc(malloc_zone_t* zone, size_t size) { + UNUSED(zone); return mi_malloc_aligned(size, _mi_os_page_size()); } static void zone_free(malloc_zone_t* zone, void* p) { + UNUSED(zone); return mi_free(p); } static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) { + UNUSED(zone); return mi_realloc(p, newsize); } static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) { + UNUSED(zone); return mi_malloc_aligned(size,alignment); } static void zone_destroy(malloc_zone_t* zone) { + UNUSED(zone); // todo: ignore for now? } @@ -83,11 +92,13 @@ static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) { } static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) { + UNUSED(zone); UNUSED(size); mi_collect(false); return 0; } static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) { + UNUSED(size); zone_free(zone,p); } @@ -102,34 +113,43 @@ static kern_return_t intro_enumerator(task_t task, void* p, vm_range_recorder_t recorder) { // todo: enumerate all memory + UNUSED(task); UNUSED(p); UNUSED(type_mask); UNUSED(zone_address); + UNUSED(reader); UNUSED(recorder); return KERN_SUCCESS; } static size_t intro_good_size(malloc_zone_t* zone, size_t size) { + UNUSED(zone); return mi_good_size(size); } static boolean_t intro_check(malloc_zone_t* zone) { + UNUSED(zone); return true; } static void intro_print(malloc_zone_t* zone, boolean_t verbose) { + UNUSED(zone); UNUSED(verbose); mi_stats_print(NULL); } static void intro_log(malloc_zone_t* zone, void* p) { + UNUSED(zone); UNUSED(p); // todo? } static void intro_force_lock(malloc_zone_t* zone) { + UNUSED(zone); // todo? } static void intro_force_unlock(malloc_zone_t* zone) { + UNUSED(zone); // todo? } static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) { + UNUSED(zone); // todo... 
stats->blocks_in_use = 0; stats->size_in_use = 0; @@ -138,6 +158,7 @@ static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) { } static boolean_t intro_zone_locked(malloc_zone_t* zone) { + UNUSED(zone); return false; } @@ -161,7 +182,6 @@ static malloc_zone_t* mi_get_default_zone() } } - static void __attribute__((constructor)) _mi_macos_override_malloc() { static malloc_introspection_t intro; @@ -201,6 +221,7 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() zone.free_definite_size = &zone_free_definite_size; zone.pressure_relief = &zone_pressure_relief; intro.zone_locked = &intro_zone_locked; + intro.statistics = &intro_statistics; // force the purgeable zone to exist to avoid strange bugs if (malloc_default_purgeable_zone) { @@ -225,6 +246,7 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() malloc_zone_unregister(purgeable_zone); malloc_zone_register(purgeable_zone); } + } #endif // MI_MALLOC_OVERRIDE diff --git a/src/alloc-override.c b/src/alloc-override.c index c0fdf161..151c2333 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -13,7 +13,7 @@ terms of the MIT license. A copy of the license can be found in the file #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif -#if defined(MI_MALLOC_OVERRIDE) && !defined(_WIN32) +#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32) || (defined(__MACH__) && !defined(MI_INTERPOSE))) // ------------------------------------------------------ // Override system malloc @@ -68,10 +68,10 @@ terms of the MIT license. A copy of the license can be found in the file // we just override new/delete which does work in a static library. #else // On all other systems forward to our API - void* malloc(size_t size) mi_attr_noexcept MI_FORWARD1(mi_malloc, size); - void* calloc(size_t size, size_t n) mi_attr_noexcept MI_FORWARD2(mi_calloc, size, n); - void* realloc(void* p, size_t newsize) mi_attr_noexcept MI_FORWARD2(mi_realloc, p, newsize); - void free(void* p) mi_attr_noexcept MI_FORWARD0(mi_free, p); + void* malloc(size_t size) MI_FORWARD1(mi_malloc, size); + void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n); + void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize); + void free(void* p) MI_FORWARD0(mi_free, p); #endif #if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__) @@ -99,8 +99,8 @@ terms of the MIT license. A copy of the license can be found in the file void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); } #if (__cplusplus >= 201402L || _MSC_VER >= 1916) - void operator delete (void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n); - void operator delete[](void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n); + void operator delete (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n); + void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n); #endif #if (__cplusplus > 201402L || defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5)) diff --git a/src/alloc.c b/src/alloc.c index 61f34353..d2fbe4b1 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -212,7 +212,7 @@ static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* bl size_t delta; bool ok = mi_page_decode_padding(page, block, &delta, &bsize); mi_assert_internal(ok); mi_assert_internal(delta <= bsize); - return (ok ? bsize - delta : 0); + return (ok ? 
bsize - delta : 0); } static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { @@ -259,7 +259,7 @@ static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, co mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); padding->delta = (uint32_t)new_delta; } -#else +#else static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { UNUSED(page); UNUSED(block); @@ -359,7 +359,7 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block } else if (mi_unlikely(mi_page_is_in_full(page))) { _mi_page_unfull(page); - } + } } else { _mi_free_block_mt(page,block); @@ -401,7 +401,7 @@ void mi_free(void* p) mi_attr_noexcept "(this may still be a valid very large allocation (over 64MiB))\n", p); if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) { _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } + } } #endif #if (MI_DEBUG!=0 || MI_SECURE>=4) @@ -421,11 +421,11 @@ void mi_free(void* p) mi_attr_noexcept mi_heap_stat_decrease(heap, malloc, bsize); if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { // huge page stats are accounted for in `_mi_page_retire` mi_heap_stat_decrease(heap, normal[_mi_bin(bsize)], 1); - } + } #endif if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks - // local, and not full or aligned + // local, and not full or aligned if (mi_unlikely(mi_check_is_double_free(page,block))) return; mi_check_padding(page, block); #if (MI_DEBUG!=0) @@ -436,7 +436,7 @@ void mi_free(void* p) mi_attr_noexcept page->used--; if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); - } + } } else { // non-local, aligned blocks, or a full page; use the more generic path @@ -473,7 +473,7 @@ size_t mi_usable_size(const void* p) mi_attr_noexcept { const mi_segment_t* const segment = _mi_ptr_segment(p); const mi_page_t* const page = _mi_segment_page_of(segment, p); const mi_block_t* const block = (const mi_block_t*)p; - const size_t size = mi_page_usable_size_of(page, block); + const size_t size = mi_page_usable_size_of(page, block); if (mi_unlikely(mi_page_has_aligned(page))) { ptrdiff_t const adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p); mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); diff --git a/src/init.c b/src/init.c index b7f329cb..2f5ca224 100644 --- a/src/init.c +++ b/src/init.c @@ -34,7 +34,7 @@ const mi_page_t _mi_page_empty = { #if defined(MI_PADDING) && (MI_INTPTR_SIZE >= 8) #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } -#elif defined(MI_PADDING) +#elif defined(MI_PADDING) #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } #else #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() } @@ -190,7 +190,7 @@ static bool _mi_heap_init(void) { heap->cookie = _mi_heap_random_next(heap) | 1; heap->keys[0] = _mi_heap_random_next(heap); heap->keys[1] = _mi_heap_random_next(heap); - heap->tld = tld; + heap->tld = tld; tld->heap_backing = heap; tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; @@ -421,9 +421,9 @@ static void mi_process_load(void) { volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; UNUSED(dummy); #endif - os_preloading = false; + os_preloading = false; atexit(&mi_process_done); - _mi_options_init(); + 
_mi_options_init();
   mi_process_init();
   //mi_stats_reset();
   if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");
diff --git a/test/test-stress.c b/test/test-stress.c
index 7d8993a0..f1c8b2e1 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -38,7 +38,7 @@ static bool   allow_large_objects = true;    // allow very large objects?
 static size_t use_one_size = 0;              // use single object size of `N * sizeof(uintptr_t)`?
 
 
-#ifdef USE_STD_MALLOC
+#ifndef USE_STD_MALLOC
 #define custom_calloc(n,s)    calloc(n,s)
 #define custom_realloc(p,s)   realloc(p,s)
 #define custom_free(p)        free(p)

From 1c2e0a47cada2cd689f34db18b28ca41a53cc1f6 Mon Sep 17 00:00:00 2001
From: daan
Date: Sun, 2 Feb 2020 22:04:53 -0800
Subject: [PATCH 11/17] fix noexcept attribute on array delete operators

---
 include/mimalloc-new-delete.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/mimalloc-new-delete.h b/include/mimalloc-new-delete.h
index 050f9433..fded0c04 100644
--- a/include/mimalloc-new-delete.h
+++ b/include/mimalloc-new-delete.h
@@ -32,8 +32,8 @@ terms of the MIT license. A copy of the license can be found in the file
   void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); }
 
   #if (__cplusplus >= 201402L || _MSC_VER >= 1916)
-  void operator delete  (void* p, std::size_t n) { mi_free_size(p,n); };
-  void operator delete[](void* p, std::size_t n) { mi_free_size(p,n); };
+  void operator delete  (void* p, std::size_t n) noexcept { mi_free_size(p,n); };
+  void operator delete[](void* p, std::size_t n) noexcept { mi_free_size(p,n); };
   #endif
 
   #if (__cplusplus > 201402L || defined(__cpp_aligned_new))

From b241be7075c32bd3952f4d9f7eb22c6531b8397e Mon Sep 17 00:00:00 2001
From: daan
Date: Sun, 2 Feb 2020 22:08:33 -0800
Subject: [PATCH 12/17] reenable mimalloc in the stress test

---
 test/test-stress.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test-stress.c b/test/test-stress.c
index f1c8b2e1..7d8993a0 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -38,7 +38,7 @@ static bool   allow_large_objects = true;    // allow very large objects?
 static size_t use_one_size = 0;              // use single object size of `N * sizeof(uintptr_t)`?
 
 
-#ifndef USE_STD_MALLOC
+#ifdef USE_STD_MALLOC
 #define custom_calloc(n,s)    calloc(n,s)
 #define custom_realloc(p,s)   realloc(p,s)
 #define custom_free(p)        free(p)

From 3560e0a867a82b6a593a01ac4995c11498f0a167 Mon Sep 17 00:00:00 2001
From: daan
Date: Sun, 2 Feb 2020 22:15:09 -0800
Subject: [PATCH 13/17] fix TLS slot number on OSX

---
 include/mimalloc-internal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index 4ac7da78..b2297c50 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -268,7 +268,7 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot
 
 /* ----------------------------------------------------------------------------------------
-The thread local default heap: `_mi_get_default_heap` return the thread local heap.
+The thread local default heap: `_mi_get_default_heap` returns the thread local heap.
 On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
 __thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
 that the storage will always be available (allocated on the thread stacks).
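
The `__thread` fast path described in the comment above can be seen in isolation in the following sketch (an illustration, not part of the patch): with the initial-exec TLS model the per-thread storage is reserved up front when the thread is created, so reading the variable never allocates -- which is exactly why it is safe to use inside `malloc` itself on most platforms:

  // tls-demo.c -- sketch of the plain __thread path; the tls_model
  // attribute is supported by GCC and clang on ELF targets.
  #include <pthread.h>
  #include <stdio.h>

  static __thread int t_count __attribute__((tls_model("initial-exec"))) = 0;

  static void* worker(void* arg) {
    (void)arg;
    for (int i = 0; i < 3; i++) t_count++;          // touches only this thread's copy
    printf("worker sees t_count = %d\n", t_count);  // 3
    return NULL;
  }

  int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, &worker, NULL);
    pthread_join(t, NULL);
    printf("main still sees t_count = %d\n", t_count);  // 0
    return 0;
  }
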
@@ -287,7 +287,7 @@ mi_heap_t* _mi_heap_main_get(void);  // statically allocated main backing hea
 
 #if defined(MI_MALLOC_OVERRIDE)
 #if defined(__MACH__) // OSX
-#define MI_TLS_SLOT               84  // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see 
+#define MI_TLS_SLOT               89  // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see 
                                       // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
 #elif defined(__OpenBSD__)
 #define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 1*sizeof(void*)) // offset `retval`

From a96e94f940db7d844030239bfbedd004d5915657 Mon Sep 17 00:00:00 2001
From: daan
Date: Sun, 2 Feb 2020 22:46:38 -0800
Subject: [PATCH 14/17] change TLS slot on OpenBSD

---
 include/mimalloc-internal.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index b2297c50..cea6b9c3 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -287,10 +287,13 @@ mi_heap_t* _mi_heap_main_get(void);  // statically allocated main backing hea
 
 #if defined(MI_MALLOC_OVERRIDE)
 #if defined(__MACH__) // OSX
-#define MI_TLS_SLOT               89  // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see 
-                                      // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+#define MI_TLS_SLOT               89  // seems unused?
+// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+// see 
 #elif defined(__OpenBSD__)
-#define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 1*sizeof(void*)) // offset `retval`
+// use end bytes of a name; goes wrong if anyone uses names > 23 characters (pthread specifies 16)
+// see 
+#define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)
 #elif defined(__DragonFly__)
 #warning "mimalloc is not working correctly on DragonFly yet."
 #define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?)
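
When neither a spare OS TLS slot nor a usable pthread offset exists, the `MI_TLS_PTHREAD` branch of `mi_get_default_heap` (shown in PATCH 06 above) falls back on a `pthread_key_t`. A reduced sketch of that fallback pattern, with a hypothetical `heap_t` standing in for the real `mi_heap_t`: before the key is created, and on threads that have not set it yet, every lookup is answered with the statically allocated main heap, mirroring the `_mi_heap_main_get()` guard:

  // pthread-key-demo.c -- sketch of the MI_TLS_PTHREAD fallback (hypothetical types).
  #include <pthread.h>
  #include <stdio.h>

  typedef struct heap_s { int id; } heap_t;

  static heap_t        main_heap = { 0 };
  static pthread_key_t heap_key  = (pthread_key_t)(-1);

  static heap_t* get_default_heap(void) {
    if (heap_key == (pthread_key_t)(-1)) return &main_heap;  // key not created yet
    heap_t* h = (heap_t*)pthread_getspecific(heap_key);
    return (h == NULL ? &main_heap : h);                     // not set on this thread yet
  }

  int main(void) {
    printf("before key: heap %d\n", get_default_heap()->id);  // 0 (main heap)
    pthread_key_create(&heap_key, NULL);
    static heap_t h1 = { 1 };
    pthread_setspecific(heap_key, &h1);
    printf("after set:  heap %d\n", get_default_heap()->id);  // 1
    return 0;
  }
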
From e67606210326c838b8fa3004a83721df4d3c6dbe Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 5 Feb 2020 17:40:13 -0800 Subject: [PATCH 15/17] update mac zone code --- src/alloc-override-osx.c | 67 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/src/alloc-override-osx.c b/src/alloc-override-osx.c index ed0bc2de..99c6a134 100644 --- a/src/alloc-override-osx.c +++ b/src/alloc-override-osx.c @@ -182,6 +182,72 @@ static malloc_zone_t* mi_get_default_zone() } } +// directly overwrite the default zone as per: +// + +static void __attribute__((constructor)) _mi_macos_override_malloc_direct() +{ + static malloc_introspection_t intro; + memset(&intro, 0, sizeof(intro)); + + intro.enumerator = &intro_enumerator; + intro.good_size = &intro_good_size; + intro.check = &intro_check; + intro.print = &intro_print; + intro.log = &intro_log; + intro.force_lock = &intro_force_lock; + intro.force_unlock = &intro_force_unlock; + + static malloc_zone_t oldzone; + static malloc_zone_t* zone = malloc_default_zone(); // get the `malloc` backing default zone + if (zone == NULL) return; + + // save the default zone in oldzone + memset(&oldzone, 0, sizeof(oldzone)); + if (zone->version >= 9) memcpy(&oldzone, zone, sizeof(oldzone)); + + // overwrite default zone functions in-place + zone->zone_name = "mimalloc"; + zone->size = &zone_size; + zone->introspect = &intro; + zone->malloc = &zone_malloc; + zone->calloc = &zone_calloc; + zone->valloc = &zone_valloc; + zone->free = &zone_free; + zone->realloc = &zone_realloc; + zone->destroy = &zone_destroy; + zone->batch_malloc = &zone_batch_malloc; + zone->batch_free = &zone_batch_free; + + malloc_zone_t* purgeable_zone = NULL; + +#if defined(MAC_OS_X_VERSION_10_6) && \ + MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 + // switch to version 9 on OSX 10.6 to support memalign. + // zone->version = 9; + zone->memalign = &zone_memalign; + zone->free_definite_size = &zone_free_definite_size; + zone->pressure_relief = &zone_pressure_relief; + intro.zone_locked = &intro_zone_locked; + intro.statistics = &intro_statistics; + /* + // force the purgeable zone to exist to avoid strange bugs + if (malloc_default_purgeable_zone) { + purgeable_zone = malloc_default_purgeable_zone(); + } + */ +#endif + /* + // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs + // earlier than the default zone. 
+ if (purgeable_zone != NULL) { + malloc_zone_unregister(purgeable_zone); + malloc_zone_register(purgeable_zone); + } + */ +} + +/* static void __attribute__((constructor)) _mi_macos_override_malloc() { static malloc_introspection_t intro; @@ -248,5 +314,6 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() } } +*/ #endif // MI_MALLOC_OVERRIDE From 9062f397649da3b4851d9107cc5a2b01021faff5 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 8 Feb 2020 20:08:52 -0800 Subject: [PATCH 16/17] enable interpose separate from zones on macOS --- CMakeLists.txt | 16 +++++++++++----- src/alloc-override-osx.c | 20 ++++++++++++++++---- src/alloc-override.c | 2 +- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2da7974b..e16830aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,11 +5,12 @@ set(CMAKE_C_STANDARD 11) set(CMAKE_CXX_STANDARD 17) option(MI_OVERRIDE "Override the standard malloc interface" ON) -option(MI_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_DEBUG_FULL "Use full internal heap invariant checking in DEBUG mode" OFF) option(MI_SECURE "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF) option(MI_SEE_ASM "Generate assembly files" OFF) +option(MI_INTERPOSE "Use interpose to override standard malloc on macOS" ON) +option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" OFF) # enables interpose as well option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) option(MI_BUILD_TESTS "Build test executables" ON) option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF) @@ -61,14 +62,19 @@ endif() if(MI_OVERRIDE MATCHES "ON") message(STATUS "Override standard malloc (MI_OVERRIDE=ON)") if(APPLE) + if(MI_OSX_ZONE MATCHES "ON") + # use zone's on macOS + message(STATUS " Use malloc zone to override malloc (MI_OSX_ZONE=ON)") + list(APPEND mi_sources src/alloc-override-osx.c) + if(NOT MI_INTERPOSE MATCHES "ON") + message(STATUS " (enabling INTERPOSE as well since zone's require this)") + set(MI_INTERPOSE "ON") + endif() + endif() if(MI_INTERPOSE MATCHES "ON") # use interpose on macOS message(STATUS " Use interpose to override malloc (MI_INTERPOSE=ON)") list(APPEND mi_defines MI_INTERPOSE) - else() - # use zone's on macOS - message(STATUS " Use zone's to override malloc (MI_INTERPOSE=OFF)") - list(APPEND mi_sources src/alloc-override-osx.c) endif() endif() endif() diff --git a/src/alloc-override-osx.c b/src/alloc-override-osx.c index 99c6a134..92d5ce2b 100644 --- a/src/alloc-override-osx.c +++ b/src/alloc-override-osx.c @@ -14,7 +14,6 @@ terms of the MIT license. A copy of the license can be found in the file #error "this file should only be included on macOS" #endif -#warning "malloc zones do not seem to work for now; use MI_INTERPOSE instead" /* ------------------------------------------------------ Override system malloc on macOS This is done through the malloc zone interface. 
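
For readers unfamiliar with the interface these patches hook into: on macOS every allocation belongs to a `malloc_zone_t`, and the system `free` finds the owning zone with `malloc_zone_from_ptr` -- which is why a replacement zone must cope with pointers it never handed out. A small sketch that only exercises the public zone API (it overrides nothing):

  // zone-demo.c -- sketch of the macOS malloc zone API, macOS only.
  #include <malloc/malloc.h>
  #include <stdio.h>

  int main(void) {
    malloc_zone_t* zone = malloc_create_zone(0 /*start_size*/, 0 /*flags*/);
    malloc_set_zone_name(zone, "demo-zone");

    void* p = malloc_zone_malloc(zone, 100);
    malloc_zone_t* owner = malloc_zone_from_ptr(p);  // how free() finds the owner
    printf("allocated from zone: %s\n", malloc_get_zone_name(owner));

    malloc_zone_free(zone, p);
    malloc_destroy_zone(zone);
    return 0;
  }
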
@@ -182,8 +181,10 @@ static malloc_zone_t* mi_get_default_zone() } } +#if 0 // directly overwrite the default zone as per: // +#include static void __attribute__((constructor)) _mi_macos_override_malloc_direct() { @@ -199,13 +200,18 @@ static void __attribute__((constructor)) _mi_macos_override_malloc_direct() intro.force_unlock = &intro_force_unlock; static malloc_zone_t oldzone; - static malloc_zone_t* zone = malloc_default_zone(); // get the `malloc` backing default zone + static malloc_zone_t* zone; + zone = mi_get_default_zone(); // get the `malloc` backing default zone if (zone == NULL) return; // save the default zone in oldzone memset(&oldzone, 0, sizeof(oldzone)); if (zone->version >= 9) memcpy(&oldzone, zone, sizeof(oldzone)); + if (zone->version >= 8) { + vm_protect(mach_task_self(), (uintptr_t)zone, sizeof(*zone), 0, + VM_PROT_READ|VM_PROT_WRITE); + } // overwrite default zone functions in-place zone->zone_name = "mimalloc"; zone->size = &zone_size; @@ -237,6 +243,11 @@ static void __attribute__((constructor)) _mi_macos_override_malloc_direct() } */ #endif + if (zone->version >= 8) { + vm_protect(mach_task_self(), (uintptr_t)zone, sizeof(*zone), 0, + VM_PROT_READ); + } + /* // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs // earlier than the default zone. @@ -247,7 +258,8 @@ static void __attribute__((constructor)) _mi_macos_override_malloc_direct() */ } -/* +#else + static void __attribute__((constructor)) _mi_macos_override_malloc() { static malloc_introspection_t intro; @@ -314,6 +326,6 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() } } -*/ +#endif #endif // MI_MALLOC_OVERRIDE diff --git a/src/alloc-override.c b/src/alloc-override.c index 151c2333..c0e7bc2b 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -13,7 +13,7 @@ terms of the MIT license. A copy of the license can be found in the file #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif -#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32) || (defined(__MACH__) && !defined(MI_INTERPOSE))) +#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) // || (defined(__MACH__) && !defined(MI_INTERPOSE))) // ------------------------------------------------------ // Override system malloc From afe434463ac92bc140691c55c3922a53f4324bfb Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 9 Feb 2020 18:26:50 -0800 Subject: [PATCH 17/17] add comments on overriding in macOSX --- src/alloc-override-osx.c | 86 +++------------------------------------- 1 file changed, 6 insertions(+), 80 deletions(-) diff --git a/src/alloc-override-osx.c b/src/alloc-override-osx.c index 92d5ce2b..cc03f5e2 100644 --- a/src/alloc-override-osx.c +++ b/src/alloc-override-osx.c @@ -17,6 +17,12 @@ terms of the MIT license. A copy of the license can be found in the file /* ------------------------------------------------------ Override system malloc on macOS This is done through the malloc zone interface. + It seems we also need to interpose (see `alloc-override.c`) + or otherwise we get zone errors as there are usually + already allocations done by the time we take over the + zone. Unfortunately, that means we need to replace + the `free` with a checked free (`cfree`) impacting + performance. 
------------------------------------------------------ */ #include @@ -181,85 +187,6 @@ static malloc_zone_t* mi_get_default_zone() } } -#if 0 -// directly overwrite the default zone as per: -// -#include - -static void __attribute__((constructor)) _mi_macos_override_malloc_direct() -{ - static malloc_introspection_t intro; - memset(&intro, 0, sizeof(intro)); - - intro.enumerator = &intro_enumerator; - intro.good_size = &intro_good_size; - intro.check = &intro_check; - intro.print = &intro_print; - intro.log = &intro_log; - intro.force_lock = &intro_force_lock; - intro.force_unlock = &intro_force_unlock; - - static malloc_zone_t oldzone; - static malloc_zone_t* zone; - zone = mi_get_default_zone(); // get the `malloc` backing default zone - if (zone == NULL) return; - - // save the default zone in oldzone - memset(&oldzone, 0, sizeof(oldzone)); - if (zone->version >= 9) memcpy(&oldzone, zone, sizeof(oldzone)); - - if (zone->version >= 8) { - vm_protect(mach_task_self(), (uintptr_t)zone, sizeof(*zone), 0, - VM_PROT_READ|VM_PROT_WRITE); - } - // overwrite default zone functions in-place - zone->zone_name = "mimalloc"; - zone->size = &zone_size; - zone->introspect = &intro; - zone->malloc = &zone_malloc; - zone->calloc = &zone_calloc; - zone->valloc = &zone_valloc; - zone->free = &zone_free; - zone->realloc = &zone_realloc; - zone->destroy = &zone_destroy; - zone->batch_malloc = &zone_batch_malloc; - zone->batch_free = &zone_batch_free; - - malloc_zone_t* purgeable_zone = NULL; - -#if defined(MAC_OS_X_VERSION_10_6) && \ - MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 - // switch to version 9 on OSX 10.6 to support memalign. - // zone->version = 9; - zone->memalign = &zone_memalign; - zone->free_definite_size = &zone_free_definite_size; - zone->pressure_relief = &zone_pressure_relief; - intro.zone_locked = &intro_zone_locked; - intro.statistics = &intro_statistics; - /* - // force the purgeable zone to exist to avoid strange bugs - if (malloc_default_purgeable_zone) { - purgeable_zone = malloc_default_purgeable_zone(); - } - */ -#endif - if (zone->version >= 8) { - vm_protect(mach_task_self(), (uintptr_t)zone, sizeof(*zone), 0, - VM_PROT_READ); - } - - /* - // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs - // earlier than the default zone. - if (purgeable_zone != NULL) { - malloc_zone_unregister(purgeable_zone); - malloc_zone_register(purgeable_zone); - } - */ -} - -#else - static void __attribute__((constructor)) _mi_macos_override_malloc() { static malloc_introspection_t intro; @@ -326,6 +253,5 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() } } -#endif #endif // MI_MALLOC_OVERRIDE
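
The "checked free" mentioned in the comment above can be illustrated with a toy allocator. This is a sketch of the idea only -- mimalloc's actual `mi_cfree` verifies segment membership rather than a simple address range -- but the point is the same: once we take over the default zone, `free` may receive pointers allocated before the takeover, so ownership has to be verified before touching the block:

  // cfree-demo.c -- sketch of a checked free over a toy bump allocator
  // (not mimalloc's implementation).
  #include <stdint.h>
  #include <stdio.h>

  static uint8_t arena[1 << 16];  // stands in for memory we handed out ourselves
  static size_t  arena_used = 0;

  static void* demo_alloc(size_t n) {
    if (arena_used + n > sizeof(arena)) return NULL;
    void* p = &arena[arena_used];
    arena_used += n;
    return p;
  }

  static int demo_owns(const void* p) {
    return (const uint8_t*)p >= arena && (const uint8_t*)p < arena + sizeof(arena);
  }

  static void checked_free(void* p) {
    if (p == NULL || !demo_owns(p)) return;  // foreign pointer: leave it alone
    // a real allocator would release the block here; the bump demo keeps it
  }

  int main(void) {
    void* ours = demo_alloc(32);
    int on_stack;
    checked_free(ours);       // ours: handled
    checked_free(&on_stack);  // not ours: safely ignored
    printf("checked_free handled both cases\n");
    return 0;
  }
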