From 42586de10437308293f5967cc4c6527c0d67a76c Mon Sep 17 00:00:00 2001
From: daan
Date: Mon, 27 Jan 2020 23:13:57 -0800
Subject: [PATCH 01/62] fix is_zero setting in regions

---
 src/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/memory.c b/src/memory.c
index 96047b79..55122887 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -284,7 +284,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo
   mi_assert_internal(!(info.x.is_large && !*is_large));
   mi_assert_internal(start != NULL);

-  *is_zero = mi_bitmap_unclaim(&region->dirty, 1, blocks, bit_idx);
+  *is_zero = mi_bitmap_claim(&region->dirty, 1, blocks, bit_idx, NULL);
   *is_large = info.x.is_large;
   *memid = mi_memid_create(region, bit_idx);
   void* p = start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);

From 9c166d88f0ca6ce5322856e58ac730972ca5404f Mon Sep 17 00:00:00 2001
From: daan
Date: Mon, 27 Jan 2020 23:15:24 -0800
Subject: [PATCH 02/62] increase retire page size

---
 src/page.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/page.c b/src/page.c
index 149926e8..28e5dfdb 100644
--- a/src/page.c
+++ b/src/page.c
@@ -386,6 +386,8 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
   _mi_segment_page_free(page, force, segments_tld);
 }

+#define MI_MAX_RETIRE_SIZE (4*MI_SMALL_SIZE_MAX)
+
 // Retire a page with no more used blocks
 // Important to not retire too quickly though as new
 // allocations might be coming.
@@ -406,7 +408,7 @@ void _mi_page_retire(mi_page_t* page) {
   // how to check this efficiently though...
   // for now, we don't retire if it is the only page left of this size class.
   mi_page_queue_t* pq = mi_page_queue_of(page);
-  if (mi_likely(page->xblock_size <= MI_SMALL_SIZE_MAX && !mi_page_is_in_full(page))) {
+  if (mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_is_in_full(page))) {
     if (pq->last==page && pq->first==page) { // the only page in the queue?
       mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
       page->retire_expire = 16;
@@ -421,7 +423,7 @@ void _mi_page_retire(mi_page_t* page) {
 // free retired pages: we don't need to look at the entire queues
 // since we only retire pages that are the last one in a queue.
 void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
-  for(mi_page_queue_t* pq = heap->pages; pq->block_size <= MI_SMALL_SIZE_MAX; pq++) {
+  for(mi_page_queue_t* pq = heap->pages; pq->block_size <= MI_MAX_RETIRE_SIZE; pq++) {
     mi_page_t* page = pq->first;
     if (page != NULL && page->retire_expire != 0) {
       if (mi_page_all_free(page)) {

From 5d212d688f82a3b17f00faa11967e9459dc78715 Mon Sep 17 00:00:00 2001
From: daan
Date: Wed, 29 Jan 2020 17:10:57 -0800
Subject: [PATCH 03/62] add MI_PADDING build option to add padding to each block to detect heap block overflows

---
 include/mimalloc-types.h    | 18 +++++++++++++----
 src/alloc.c                 | 40 +++++++++++++++++++++++++++++++++----
 test/main-override-static.c |  7 +++++++
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index 48d86a25..39debae1 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -12,6 +12,10 @@ terms of the MIT license. A copy of the license can be found in the file
 #include <stdint.h>           // uintptr_t, uint16_t, etc
 #include <mimalloc-atomic.h>  // _Atomic

+// Minimal alignment necessary. On most platforms 16 bytes are needed
+// due to SSE registers for example.
This must be at least `MI_INTPTR_SIZE` +#define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) + // ------------------------------------------------------ // Variants // ------------------------------------------------------ @@ -50,6 +54,16 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ENCODE_FREELIST 1 #endif +// Reserve extra padding at the end of each block; must be a multiple of `sizeof(intptr_t)`! +// If free lists are encoded, the padding is checked if it was modified on free. +#if (!defined(MI_PADDING)) +#if (MI_SECURE>=3 || MI_DEBUG>=1) +#define MI_PADDING MI_MAX_ALIGN_SIZE +#else +#define MI_PADDING 0 +#endif +#endif + // ------------------------------------------------------ // Platform specific values // ------------------------------------------------------ @@ -113,10 +127,6 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE) #define MI_HUGE_OBJ_SIZE_MAX (2*MI_INTPTR_SIZE*MI_SEGMENT_SIZE) // (must match MI_REGION_MAX_ALLOC_SIZE in memory.c) -// Minimal alignment necessary. On most platforms 16 bytes are needed -// due to SSE registers for example. This must be at least `MI_INTPTR_SIZE` -#define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) - // Maximum number of size classes. (spaced exponentially in 12.5% increments) #define MI_BIN_HUGE (73U) diff --git a/src/alloc.c b/src/alloc.c index 3f577f2f..e4324d73 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -42,6 +42,11 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz size_t bin = _mi_bin(size); mi_heap_stat_increase(heap,normal[bin], 1); } +#endif +#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST) + mi_assert_internal((MI_PADDING % sizeof(mi_block_t*)) == 0); + mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + page->xblock_size - MI_PADDING); + mi_block_set_nextx(page, padding, block, page->key[0], page->key[1]); #endif return block; } @@ -54,6 +59,9 @@ extern inline mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size } extern inline mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept { +#if (MI_PADDING>0) + size += MI_PADDING; +#endif return mi_heap_malloc_small(mi_get_default_heap(), size); } @@ -69,6 +77,9 @@ mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept { extern inline mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { mi_assert(heap!=NULL); mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local +#if (MI_PADDING>0) + size += MI_PADDING; +#endif void* p; if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { p = mi_heap_malloc_small(heap, size); @@ -99,11 +110,11 @@ void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) { if (page->is_zero) { // already zero initialized memory? 
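    // (why clearing a single word is enough here: an `is_zero` page comes
    //  zeroed from the OS, and the only word ever written into an unused
    //  block is the free-list link in its first word -- so resetting `next`
    //  below restores a fully zeroed block, which the assert then verifies)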
((mi_block_t*)p)->next = 0;  // clear the free list pointer
-    mi_assert_expensive(mi_mem_is_zero(p, mi_page_block_size(page)));
+    mi_assert_expensive(mi_mem_is_zero(p, mi_page_block_size(page) - MI_PADDING));
   }
   else {
     // otherwise memset
-    memset(p, 0, mi_page_block_size(page));
+    memset(p, 0, mi_page_block_size(page) - MI_PADDING);
   }
 }
@@ -171,6 +182,20 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block
 }
 #endif

+#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST)
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+  mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + page->xblock_size - MI_PADDING);
+  mi_block_t* const decoded = mi_block_nextx(page, padding, page->key[0], page->key[1]);
+  if (decoded != block) {
+    _mi_error_message(EINVAL, "buffer overflow in heap block %p: write after %zu bytes\n", block, page->xblock_size);
+  }
+}
+#else
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+  UNUSED(page);
+  UNUSED(block);
+}
+#endif

 // ------------------------------------------------------
 // Free
 // ------------------------------------------------------
@@ -214,6 +239,8 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
     return;
   }

+  mi_check_padding(page, block);
+
   mi_thread_free_t tfree;
   mi_thread_free_t tfreex;
   bool use_delayed;
@@ -258,13 +285,14 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
 static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block)
 {
 #if (MI_DEBUG)
-  memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
+  memset(block, MI_DEBUG_FREED, mi_page_block_size(page) - MI_PADDING);
 #endif

   // and push it on the free list
   if (mi_likely(local)) {
     // owning thread can free a block directly
     if (mi_unlikely(mi_check_is_double_free(page, block))) return;
+    mi_check_padding(page, block);
     mi_block_set_next(page, block, page->local_free);
     page->local_free = block;
     page->used--;
@@ -341,6 +369,7 @@ void mi_free(void* p) mi_attr_noexcept
     // local, and not full or aligned
     mi_block_t* const block = (mi_block_t*)p;
     if (mi_unlikely(mi_check_is_double_free(page,block))) return;
+    mi_check_padding(page, block);
     mi_block_set_next(page, block, page->local_free);
     page->local_free = block;
     page->used--;
@@ -381,8 +410,11 @@ bool _mi_free_delayed_block(mi_block_t* block) {
 size_t mi_usable_size(const void* p) mi_attr_noexcept {
   if (p==NULL) return 0;
   const mi_segment_t* segment = _mi_ptr_segment(p);
-  const mi_page_t* page = _mi_segment_page_of(segment,p);
+  const mi_page_t* page = _mi_segment_page_of(segment, p);
   size_t size = mi_page_block_size(page);
+#if defined(MI_PADDING)
+  size -= MI_PADDING;
+#endif
   if (mi_unlikely(mi_page_has_aligned(page))) {
     ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p);
     mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
diff --git a/test/main-override-static.c b/test/main-override-static.c
index 54a5ea66..a1c3edee 100644
--- a/test/main-override-static.c
+++ b/test/main-override-static.c
@@ -10,6 +10,7 @@
 static void double_free1();
 static void double_free2();
 static void corrupt_free();
+static void block_overflow1();

 int main() {
   mi_version();
@@ -18,6 +19,7 @@ int main() {
   // double_free1();
   // double_free2();
   // corrupt_free();
+  // block_overflow1();

   void* p1 = malloc(78);
   void* p2 = malloc(24);
@@ -41,6 +43,11 @@ int main() {
   return 0;
 }

+static void block_overflow1() {
+  void* p = mi_malloc(16);
+  memset(p, 0, 17);
+  free(p);
+}

 // The double free samples come from ArcHeap [1] by Insu Yun
(issue #161) // [1]: https://arxiv.org/pdf/1903.00503.pdf From 7ff3ec2bf74b9014279103a55b632df182dacc7c Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 29 Jan 2020 17:25:40 -0800 Subject: [PATCH 04/62] use EFAULT for buffer overflow and call abort in debug mode (as well as secure mode) --- src/alloc.c | 2 +- src/options.c | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/alloc.c b/src/alloc.c index e4324d73..6852d652 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -187,7 +187,7 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + page->xblock_size - MI_PADDING); mi_block_t* const decoded = mi_block_nextx(page, padding, page->key[0], page->key[1]); if (decoded != block) { - _mi_error_message(EINVAL, "buffer overflow in heap block %p: write after %zu bytes\n", block, page->xblock_size); + _mi_error_message(EFAULT, "buffer overflow in heap block %p: write after %zu bytes\n", block, page->xblock_size); } } #else diff --git a/src/options.c b/src/options.c index af051aa2..7559a4b5 100644 --- a/src/options.c +++ b/src/options.c @@ -319,6 +319,14 @@ static volatile _Atomic(void*) mi_error_arg; // = NULL static void mi_error_default(int err) { UNUSED(err); +#if (MI_DEBUG>0) + if (err==EFAULT) { + #ifdef _MSC_VER + __debugbreak(); + #endif + abort(); + } +#endif #if (MI_SECURE>0) if (err==EFAULT) { // abort on serious errors in secure mode (corrupted meta-data) abort(); From 03b363a1c289ad4461c219050466a9f7de0b8432 Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 29 Jan 2020 22:46:44 -0800 Subject: [PATCH 05/62] first working tls on macOS using interpose; still slow --- CMakeLists.txt | 2 +- include/mimalloc-internal.h | 38 +++++++++++------- src/alloc-override.c | 7 +++- src/alloc.c | 2 +- src/init.c | 62 ++++++++++++++++------------ src/options.c | 32 ++++++++++----- src/random.c | 34 ++++++++-------- src/segment.c | 80 ++++++++++++++++++------------------- test/test-stress.c | 18 ++++----- 9 files changed, 155 insertions(+), 120 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b60e64a4..2da7974b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -247,7 +247,7 @@ if (MI_BUILD_TESTS MATCHES "ON") target_compile_definitions(mimalloc-test-stress PRIVATE ${mi_defines}) target_compile_options(mimalloc-test-stress PRIVATE ${mi_cflags}) target_include_directories(mimalloc-test-stress PRIVATE include) - target_link_libraries(mimalloc-test-stress PRIVATE mimalloc-static ${mi_libraries}) + target_link_libraries(mimalloc-test-stress PRIVATE mimalloc ${mi_libraries}) enable_testing() add_test(test_api, mimalloc-test-api) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index c7d7a1da..f4b578f6 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -33,7 +33,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) -#define mi_decl_cache_align +#define mi_decl_cache_align #endif @@ -51,6 +51,7 @@ void _mi_random_init(mi_random_ctx_t* ctx); void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); uintptr_t _mi_random_next(mi_random_ctx_t* ctx); uintptr_t _mi_heap_random_next(mi_heap_t* heap); +uintptr_t _os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c @@ -233,7 +234,7 @@ static inline size_t _mi_wsize_from_size(size_t size) { // Overflow detecting multiply -static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { +static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { #if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5 #include // UINT_MAX, ULONG_MAX #if (SIZE_MAX == UINT_MAX) @@ -274,18 +275,24 @@ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value o extern mi_heap_t _mi_heap_main; // statically allocated main backing heap extern bool _mi_process_is_initialized; -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -static inline mi_heap_t* mi_get_default_heap(void) { #ifdef MI_TLS_RECURSE_GUARD +extern mi_heap_t* _mi_get_default_heap_tls_safe(void); +static inline mi_heap_t* mi_get_default_heap(void) { // on some BSD platforms, like macOS, the dynamic loader calls `malloc` // to initialize thread local data. To avoid recursion, we need to avoid // accessing the thread local `_mi_default_heap` until our module is loaded // and use the statically allocated main heap until that time. // TODO: patch ourselves dynamically to avoid this check every time? - if (!_mi_process_is_initialized) return &_mi_heap_main; -#endif + return _mi_get_default_heap_tls_safe(); +#else + +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from + +static inline mi_heap_t* mi_get_default_heap(void) { return _mi_heap_default; + +#endif } static inline bool mi_heap_is_default(const mi_heap_t* heap) { @@ -302,6 +309,7 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) { } static inline uintptr_t _mi_ptr_cookie(const void* p) { + mi_assert_internal(_mi_heap_main.cookie != 0); return ((uintptr_t)p ^ _mi_heap_main.cookie); } @@ -345,7 +353,7 @@ static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, con // Get the page containing the pointer static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { - uintptr_t idx = _mi_segment_page_idx_of(segment, p); + uintptr_t idx = _mi_segment_page_idx_of(segment, p); return &((mi_segment_t*)segment)->pages[idx]; } @@ -411,14 +419,14 @@ static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* return mi_tf_make(block, mi_tf_delayed(tf)); } -// are all blocks in a page freed? +// are all blocks in a page freed? // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`. static inline bool mi_page_all_free(const mi_page_t* page) { mi_assert_internal(page != NULL); return (page->used == 0); } -// are there any available blocks? +// are there any available blocks? 
static inline bool mi_page_has_any_available(const mi_page_t* page) { mi_assert_internal(page != NULL && page->reserved > 0); return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); @@ -466,11 +474,11 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { /* ------------------------------------------------------------------- Encoding/Decoding the free list next pointers -This is to protect against buffer overflow exploits where the -free list is mutated. Many hardened allocators xor the next pointer `p` +This is to protect against buffer overflow exploits where the +free list is mutated. Many hardened allocators xor the next pointer `p` with a secret key `k1`, as `p^k1`. This prevents overwriting with known -values but might be still too weak: if the attacker can guess -the pointer `p` this can reveal `k1` (since `p^k1^p == k1`). +values but might be still too weak: if the attacker can guess +the pointer `p` this can reveal `k1` (since `p^k1^p == k1`). Moreover, if multiple blocks can be read as well, the attacker can xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot about the pointers (and subsequently `k1`). @@ -478,9 +486,9 @@ about the pointers (and subsequently `k1`). Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<< struct mi_interpose_s { @@ -54,7 +58,7 @@ terms of the MIT license. A copy of the license can be found in the file MI_INTERPOSE_MI(malloc), MI_INTERPOSE_MI(calloc), MI_INTERPOSE_MI(realloc), - MI_INTERPOSE_MI(free), + MI_INTERPOSEX(free,mi_free_tls_safe), MI_INTERPOSE_MI(strdup), MI_INTERPOSE_MI(strndup) }; @@ -194,4 +198,3 @@ int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_me #endif #endif // MI_MALLOC_OVERRIDE && !_WIN32 - diff --git a/src/alloc.c b/src/alloc.c index 3f577f2f..d60c33bf 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -21,7 +21,7 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. -extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { +extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); mi_block_t* block = page->free; if (mi_unlikely(block == NULL)) { diff --git a/src/init.c b/src/init.c index f8411187..922b7438 100644 --- a/src/init.c +++ b/src/init.c @@ -104,9 +104,9 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; static mi_tld_t tld_main = { 0, false, &_mi_heap_main, - { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, - 0, 0, 0, 0, 0, 0, NULL, - tld_main_stats, tld_main_os + { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, + 0, 0, 0, 0, 0, 0, NULL, + tld_main_stats, tld_main_os }, // segments { 0, tld_main_stats }, // os { MI_STATS_NULL } // stats @@ -124,9 +124,9 @@ mi_heap_t _mi_heap_main = { MI_PAGE_QUEUES_EMPTY, ATOMIC_VAR_INIT(NULL), 0, // thread id - MI_INIT_COOKIE, // initial cookie - { MI_INIT_COOKIE, MI_INIT_COOKIE }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) - { {0}, {0}, 0 }, // random + 0, // initial cookie + { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) 
+ { {0x846ca68b}, {0}, 0 }, // random 0, // page count false // can reclaim }; @@ -148,14 +148,15 @@ typedef struct mi_thread_data_s { // Initialize the thread local default heap, called from `mi_thread_init` static bool _mi_heap_init(void) { - if (mi_heap_is_initialized(_mi_heap_default)) return true; + if (mi_heap_is_initialized(mi_get_default_heap())) return true; if (_mi_is_main_thread()) { + mi_assert_internal(_mi_heap_main.thread_id != 0); // the main heap is statically allocated _mi_heap_set_default_direct(&_mi_heap_main); - mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap()); + //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap()); } else { - // use `_mi_os_alloc` to allocate directly from the OS + // use `_mi_os_alloc` to allocate directly from the OS mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t),&_mi_stats_main); // Todo: more efficient allocation? if (td == NULL) { _mi_error_message(ENOMEM, "failed to allocate thread local heap memory\n"); @@ -170,7 +171,7 @@ static bool _mi_heap_init(void) { heap->cookie = _mi_heap_random_next(heap) | 1; heap->key[0] = _mi_heap_random_next(heap); heap->key[1] = _mi_heap_random_next(heap); - heap->tld = tld; + heap->tld = tld; tld->heap_backing = heap; tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; @@ -265,8 +266,9 @@ static void _mi_thread_done(mi_heap_t* default_heap); #endif // Set up handlers so `mi_thread_done` is called automatically +static bool tls_initialized = false; // fine if it races + static void mi_process_setup_auto_thread_done(void) { - static bool tls_initialized = false; // fine if it races if (tls_initialized) return; tls_initialized = true; #if defined(_WIN32) && defined(MI_SHARED_LIB) @@ -317,7 +319,9 @@ static void _mi_thread_done(mi_heap_t* heap) { void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(heap != NULL); + #ifndef MI_TLS_RECURSE_GUARD _mi_heap_default = heap; + #endif // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. @@ -330,7 +334,11 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { #endif } - +mi_heap_t* _mi_get_default_heap_tls_safe(void) { + if (mi_unlikely(mi_pthread_key==0)) return (mi_heap_t*)&_mi_heap_empty; + mi_heap_t* heap = pthread_getspecific(mi_pthread_key); + return (mi_likely(heap!=NULL) ? heap : (mi_heap_t*)&_mi_heap_empty); +} // -------------------------------------------------------- // Run functions on process init/done, and thread init/done @@ -339,6 +347,7 @@ static void mi_process_done(void); static bool os_preloading = true; // true until this module is initialized static bool mi_redirected = false; // true if malloc redirects to mi_malloc +bool _mi_tls_initialized = false; // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. 
bool _mi_preloading() { @@ -383,7 +392,10 @@ static void mi_allocator_done() { // Called once by the process loader static void mi_process_load(void) { + volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; + UNUSED(dummy); os_preloading = false; + _mi_tls_initialized = true; atexit(&mi_process_done); _mi_options_init(); mi_process_init(); @@ -398,26 +410,26 @@ static void mi_process_load(void) { } } +void _mi_heap_main_init(void) { + if (_mi_heap_main.cookie == 0) { + _mi_heap_main.thread_id = _mi_thread_id(); + _mi_heap_main.cookie = _os_random_weak((uintptr_t)&_mi_heap_main_init); + _mi_random_init(&_mi_heap_main.random); + _mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main); + _mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main); + } +} + // Initialize the process; called by thread_init or the process loader void mi_process_init(void) mi_attr_noexcept { // ensure we are called once if (_mi_process_is_initialized) return; - // access _mi_heap_default before setting _mi_process_is_initialized to ensure - // that the TLS slot is allocated without getting into recursion on macOS - // when using dynamic linking with interpose. - mi_get_default_heap(); _mi_process_is_initialized = true; - - _mi_heap_main.thread_id = _mi_thread_id(); - _mi_verbose_message("process init: 0x%zx\n", _mi_heap_main.thread_id); - _mi_random_init(&_mi_heap_main.random); - #ifndef __APPLE__ // TODO: fix this? cannot update cookie if allocation already happened.. - _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main); - #endif mi_process_setup_auto_thread_done(); + + _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id()); _mi_os_init(); + _mi_heap_main_init(); #if (MI_DEBUG) _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif diff --git a/src/options.c b/src/options.c index af051aa2..c0bf9680 100644 --- a/src/options.c +++ b/src/options.c @@ -53,7 +53,7 @@ static mi_option_desc_t options[_mi_option_last] = // stable options { MI_DEBUG, UNINIT, MI_OPTION(show_errors) }, { 0, UNINIT, MI_OPTION(show_stats) }, - { 0, UNINIT, MI_OPTION(verbose) }, + { 1, UNINIT, MI_OPTION(verbose) }, // the following options are experimental and not all combinations make sense. { 1, UNINIT, MI_OPTION(eager_commit) }, // commit on demand @@ -239,16 +239,30 @@ static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT // inside the C runtime causes another message. static mi_decl_thread bool recurse = false; +static bool mi_recurse_enter(void) { + #ifdef MI_TLS_RECURSE_GUARD + if (_mi_preloading()) return true; + #endif + if (recurse) return false; + recurse = true; + return true; +} + +static void mi_recurse_exit(void) { + #ifdef MI_TLS_RECURSE_GUARD + if (_mi_preloading()) return; + #endif + recurse = false; +} + void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) { - if (recurse) return; + if (!mi_recurse_enter()) return; if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr? out = mi_out_get_default(&arg); } - recurse = true; if (prefix != NULL) out(prefix,arg); out(message,arg); - recurse = false; - return; + mi_recurse_exit(); } // Define our own limited `fprintf` that avoids memory allocation. 
@@ -256,14 +270,12 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { char buf[512]; if (fmt==NULL) return; - if (recurse) return; - recurse = true; + if (!mi_recurse_enter()) return; vsnprintf(buf,sizeof(buf)-1,fmt,args); - recurse = false; + mi_recurse_exit(); _mi_fputs(out,arg,prefix,buf); } - void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) { va_list args; va_start(args,fmt); @@ -290,7 +302,7 @@ void _mi_verbose_message(const char* fmt, ...) { static void mi_show_error_message(const char* fmt, va_list args) { if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; if (mi_atomic_increment(&error_count) > mi_max_error_count) return; - mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args); + mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args); } void _mi_warning_message(const char* fmt, ...) { diff --git a/src/random.c b/src/random.c index 6fef2434..b3dbf4f8 100644 --- a/src/random.c +++ b/src/random.c @@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- We use our own PRNG to keep predictable performance of random number generation -and to avoid implementations that use a lock. We only use the OS provided +and to avoid implementations that use a lock. We only use the OS provided random source to initialize the initial seeds. Since we do not need ultimate performance but we do rely on the security (for secret cookies in secure mode) we use a cryptographically secure generator (chacha20). @@ -21,11 +21,11 @@ we use a cryptographically secure generator (chacha20). /* ---------------------------------------------------------------------------- -Chacha20 implementation as the original algorithm with a 64-bit nonce +Chacha20 implementation as the original algorithm with a 64-bit nonce and counter: https://en.wikipedia.org/wiki/Salsa20 The input matrix has sixteen 32-bit values: Position 0 to 3: constant key -Position 4 to 11: the key +Position 4 to 11: the key Position 12 to 13: the counter. Position 14 to 15: the nonce. 
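As a sketch of that layout, the sixteen words form a 4x4 state:

   input[ 0.. 3] : constant  constant  constant  constant
   input[ 4.. 7] : key       key       key       key
   input[ 8..11] : key       key       key       key
   input[12..15] : counter   counter   nonce     nonce

(each cell is one 32-bit word; `chacha_block` below scrambles a copy of
this state to produce the next 16 output words and then advances the
counter, overflowing into the nonce if needed)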
@@ -44,8 +44,8 @@ static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7); } -static void chacha_block(mi_random_ctx_t* ctx) -{ +static void chacha_block(mi_random_ctx_t* ctx) +{ // scramble into `x` uint32_t x[16]; for (size_t i = 0; i < 16; i++) { @@ -72,8 +72,8 @@ static void chacha_block(mi_random_ctx_t* ctx) ctx->input[12] += 1; if (ctx->input[12] == 0) { ctx->input[13] += 1; - if (ctx->input[13] == 0) { // and keep increasing into the nonce - ctx->input[14] += 1; + if (ctx->input[13] == 0) { // and keep increasing into the nonce + ctx->input[14] += 1; } } } @@ -83,7 +83,7 @@ static uint32_t chacha_next32(mi_random_ctx_t* ctx) { chacha_block(ctx); ctx->output_available = 16; // (assign again to suppress static analysis warning) } - const uint32_t x = ctx->output[16 - ctx->output_available]; + const uint32_t x = ctx->output[16 - ctx->output_available]; ctx->output[16 - ctx->output_available] = 0; // reset once the data is handed out ctx->output_available--; return x; @@ -94,9 +94,9 @@ static inline uint32_t read32(const uint8_t* p, size_t idx32) { return ((uint32_t)p[i+0] | (uint32_t)p[i+1] << 8 | (uint32_t)p[i+2] << 16 | (uint32_t)p[i+3] << 24); } -static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce) +static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce) { - // since we only use chacha for randomness (and not encryption) we + // since we only use chacha for randomness (and not encryption) we // do not _need_ to read 32-bit values as little endian but we do anyways // just for being compatible :-) memset(ctx, 0, sizeof(*ctx)); @@ -110,7 +110,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = (uint32_t)nonce; - ctx->input[15] = (uint32_t)(nonce >> 32); + ctx->input[15] = (uint32_t)(nonce >> 32); } static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) { @@ -184,7 +184,7 @@ static bool os_random_buf(void* buf, size_t buf_len) { arc4random_buf(buf, buf_len); return true; } -#elif defined(__linux__) +#elif defined(__linux__) #include #include #include @@ -241,8 +241,8 @@ static bool os_random_buf(void* buf, size_t buf_len) { #include #endif -static uintptr_t os_random_weak(uintptr_t extra_seed) { - uintptr_t x = (uintptr_t)&os_random_weak ^ extra_seed; // ASLR makes the address random +uintptr_t _os_random_weak(uintptr_t extra_seed) { + uintptr_t x = (uintptr_t)&_os_random_weak ^ extra_seed; // ASLR makes the address random #if defined(_WIN32) LARGE_INTEGER pcount; QueryPerformanceCounter(&pcount); @@ -267,10 +267,10 @@ static uintptr_t os_random_weak(uintptr_t extra_seed) { void _mi_random_init(mi_random_ctx_t* ctx) { uint8_t key[32]; if (!os_random_buf(key, sizeof(key))) { - // if we fail to get random data from the OS, we fall back to a + // if we fail to get random data from the OS, we fall back to a // weak random source based on the current time _mi_warning_message("unable to use secure randomness\n"); - uintptr_t x = os_random_weak(0); + uintptr_t x = _os_random_weak(0); for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words. 
x = _mi_random_shuffle(x); ((uint32_t*)key)[i] = (uint32_t)x; @@ -280,7 +280,7 @@ void _mi_random_init(mi_random_ctx_t* ctx) { } /* -------------------------------------------------------- -test vectors from +test vectors from ----------------------------------------------------------- */ /* static bool array_equals(uint32_t* x, uint32_t* y, size_t n) { diff --git a/src/segment.c b/src/segment.c index c7a9662b..0e70c3bf 100644 --- a/src/segment.c +++ b/src/segment.c @@ -17,9 +17,9 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_ /* -------------------------------------------------------------------------------- Segment allocation - We allocate pages inside bigger "segments" (4mb on 64-bit). This is to avoid - splitting VMA's on Linux and reduce fragmentation on other OS's. - Each thread owns its own segments. + We allocate pages inside bigger "segments" (4mb on 64-bit). This is to avoid + splitting VMA's on Linux and reduce fragmentation on other OS's. + Each thread owns its own segments. Currently we have: - small pages (64kb), 64 in one segment @@ -154,14 +154,14 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* for (size_t i = 0; i < segment->capacity; i++) { const mi_page_t* const page = &segment->pages[i]; if (!page->segment_in_use) { - nfree++; + nfree++; } if (page->segment_in_use || page->is_reset) { mi_assert_expensive(!mi_pages_reset_contains(page, tld)); } } mi_assert_internal(nfree + segment->used == segment->capacity); - mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 + // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 mi_assert_internal(segment->page_kind == MI_PAGE_HUGE || (mi_segment_page_size(segment) * segment->capacity == segment->segment_size)); return true; @@ -286,7 +286,7 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen mi_assert_expensive(!mi_pages_reset_contains(page, tld)); mi_assert_internal(_mi_page_segment(page)==segment); if (!mi_option_is_enabled(mi_option_page_reset)) return; - if (segment->mem_is_fixed || page->segment_in_use || !page->is_committed || page->is_reset) return; + if (segment->mem_is_fixed || page->segment_in_use || !page->is_committed || page->is_reset) return; if (mi_option_get(mi_option_reset_delay) == 0) { // reset immediately? 
@@ -295,7 +295,7 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen else { // otherwise push on the delayed page reset queue mi_page_queue_t* pq = &tld->pages_reset; - // push on top + // push on top mi_page_reset_set_expire(page); page->next = pq->first; page->prev = NULL; @@ -316,7 +316,7 @@ static void mi_pages_reset_remove(mi_page_t* page, mi_segments_tld_t* tld) { mi_page_queue_t* pq = &tld->pages_reset; mi_assert_internal(pq!=NULL); mi_assert_internal(!page->segment_in_use); - mi_assert_internal(mi_pages_reset_contains(page, tld)); + mi_assert_internal(mi_pages_reset_contains(page, tld)); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == pq->last) pq->last = page->prev; @@ -332,19 +332,19 @@ static void mi_pages_reset_remove_all_in_segment(mi_segment_t* segment, bool for if (!page->segment_in_use && page->is_committed && !page->is_reset) { mi_pages_reset_remove(page, tld); if (force_reset) { - mi_page_reset(segment, page, 0, tld); + mi_page_reset(segment, page, 0, tld); } } else { mi_assert_internal(mi_page_not_in_queue(page,tld)); - } + } } } static void mi_reset_delayed(mi_segments_tld_t* tld) { if (!mi_option_is_enabled(mi_option_page_reset)) return; mi_msecs_t now = _mi_clock_now(); - mi_page_queue_t* pq = &tld->pages_reset; + mi_page_queue_t* pq = &tld->pages_reset; // from oldest up to the first that has not expired yet mi_page_t* page = pq->last; while (page != NULL && mi_page_reset_is_expired(page,now)) { @@ -358,7 +358,7 @@ static void mi_reset_delayed(mi_segments_tld_t* tld) { pq->last = page; if (page != NULL){ page->next = NULL; - } + } else { pq->first = NULL; } @@ -540,7 +540,7 @@ void _mi_segment_thread_collect(mi_segments_tld_t* tld) { } mi_assert_internal(tld->cache_count == 0); mi_assert_internal(tld->cache == NULL); -#if MI_DEBUG>=2 +#if MI_DEBUG>=2 if (!_mi_is_main_thread()) { mi_assert_internal(tld->pages_reset.first == NULL); mi_assert_internal(tld->pages_reset.last == NULL); @@ -684,7 +684,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { UNUSED(force); - mi_assert(segment != NULL); + mi_assert(segment != NULL); // note: don't reset pages even on abandon as the whole segment is freed? 
(and ready for reuse) bool force_reset = (force && mi_option_is_enabled(mi_option_abandoned_page_reset)); mi_pages_reset_remove_all_in_segment(segment, force_reset, tld); @@ -716,7 +716,7 @@ static bool mi_segment_has_free(const mi_segment_t* segment) { static void mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { mi_assert_internal(_mi_page_segment(page) == segment); - mi_assert_internal(!page->segment_in_use); + mi_assert_internal(!page->segment_in_use); // set in-use before doing unreset to prevent delayed reset mi_pages_reset_remove(page, tld); page->segment_in_use = true; @@ -756,7 +756,7 @@ static void mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_seg static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); // clear page data; can be called on abandoned segments -static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool allow_reset, mi_segments_tld_t* tld) +static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool allow_reset, mi_segments_tld_t* tld) { mi_assert_internal(page->segment_in_use); mi_assert_internal(mi_page_all_free(page)); @@ -787,7 +787,7 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a segment->used--; // add to the free page list for reuse/reset - if (allow_reset) { + if (allow_reset) { mi_pages_reset_add(segment, page, tld); } } @@ -841,12 +841,12 @@ Note: the current implementation is one possible design; another way might be to keep track of abandoned segments in the regions. This would have the advantage of keeping all concurrent code in one place and not needing to deal -with ABA issues. The drawback is that it is unclear how to -scan abandoned segments efficiently in that case as they +with ABA issues. The drawback is that it is unclear how to +scan abandoned segments efficiently in that case as they would be spread among all other segments in the regions. ----------------------------------------------------------- */ -// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers +// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers // to put in a tag that increments on update to avoid the A-B-A problem. #define MI_TAGGED_MASK MI_SEGMENT_MASK typedef uintptr_t mi_tagged_segment_t; @@ -862,7 +862,7 @@ static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_se } // This is a list of visited abandoned pages that were full at the time. -// this list migrates to `abandoned` when that becomes NULL. The use of +// this list migrates to `abandoned` when that becomes NULL. The use of // this list reduces contention and the rate at which segments are visited. static mi_decl_cache_align volatile _Atomic(mi_segment_t*) abandoned_visited; // = NULL @@ -888,7 +888,7 @@ static void mi_abandoned_visited_push(mi_segment_t* segment) { } // Move the visited list to the abandoned list. -static bool mi_abandoned_visited_revisit(void) +static bool mi_abandoned_visited_revisit(void) { // quick check if the visited list is empty if (mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned_visited)==NULL) return false; @@ -954,12 +954,12 @@ static mi_segment_t* mi_abandoned_pop(void) { segment = mi_tagged_segment_ptr(ts); if (mi_likely(segment == NULL)) { if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL - return NULL; + return NULL; } } // Do a pop. 
We use a reader count to prevent
  // a segment from being decommitted while a read is still pending,
  // and a tagged pointer to prevent A-B-A link corruption.
  // (this is called from `memory.c:_mi_mem_free` for example)
  mi_atomic_increment(&abandoned_readers);  // ensure no segment gets decommitted
@@ -1024,7 +1024,7 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
 ----------------------------------------------------------- */

 // Possibly clear pages and check if free space is available
-static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free)
+static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free)
 {
   mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
   bool has_page = false;
   size_t pages_used = 0;
   size_t pages_used_empty = 0;
   for (size_t i = 0; i < segment->capacity; i++) {
     mi_page_t* page = &segment->pages[i];
-    if (page->segment_in_use) {
+    if (page->segment_in_use) {
       pages_used++;
       // ensure used count is up to date and collect potential concurrent frees
-      _mi_page_free_collect(page, false);
+      _mi_page_free_collect(page, false);
       if (mi_page_all_free(page)) {
         // if everything free already, page can be reused for some block size
         // note: don't clear the page yet as we can only OS reset it once it is reclaimed
         pages_used_empty++;
         has_page = true;
       }
-      else if (page->xblock_size == block_size && mi_page_has_any_available(page)) {
+      else if (page->xblock_size == block_size && mi_page_has_any_available(page)) {
         // a page has available free blocks of the right size
         has_page = true;
       }
@@ -1051,7 +1051,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool
       // whole empty page
       has_page = true;
     }
-  }
+  }
   mi_assert_internal(pages_used == segment->used && pages_used >= pages_used_empty);
   if (all_pages_free != NULL) {
     *all_pages_free = ((pages_used - pages_used_empty) == 0);
@@ -1100,7 +1100,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap,
         if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; }
       }
     }
-  }
+  }
   else if (page->is_committed && !page->is_reset) {  // not in-use, and not reset yet
     // note: do not reset as this includes pages that were not touched before
     // mi_pages_reset_add(segment, page, tld);
@@ -1141,17 +1141,17 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size,
       // free the segment (by forced reclaim) to make it available to other threads.
       // note1: we prefer to free a segment as that might lead to reclaiming another
       // segment that is still partially used.
-      // note2: we could in principle optimize this by skipping reclaim and directly
+      // note2: we could in principle optimize this by skipping reclaim and directly
       // freeing but that would violate some invariants temporarily)
       mi_segment_reclaim(segment, heap, 0, NULL, tld);
     }
     else if (has_page && segment->page_kind == page_kind) {
-      // found a free page of the right kind, or page of the right block_size with free space
+      // found a free page of the right kind, or page of the right block_size with free space
       // we return the result of reclaim (which is usually `segment`) as it might free
      // the segment due to concurrent frees (in which case `NULL` is returned).
return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld);
     }
-    else if (segment->abandoned_visits >= 3) {
+    else if (segment->abandoned_visits >= 3) {
       // always reclaim on 3rd visit to limit the list length.
       mi_segment_reclaim(segment, heap, 0, NULL, tld);
     }
@@ -1165,12 +1165,12 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size,


 /* -----------------------------------------------------------
-   Reclaim or allocate
+   Reclaim or allocate
 ----------------------------------------------------------- */

-static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
 {
-  mi_assert_internal(page_kind <= MI_PAGE_LARGE);
+  mi_assert_internal(page_kind <= MI_PAGE_LARGE);
   mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
   // 1. try to get a segment from our cache
   mi_segment_t* segment = mi_segment_cache_pop(MI_SEGMENT_SIZE, tld);
@@ -1191,7 +1191,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_s
     return segment;
   }
   // 3. otherwise allocate a fresh segment
-  return mi_segment_alloc(0, page_kind, page_shift, tld, os_tld);
+  return mi_segment_alloc(0, page_kind, page_shift, tld, os_tld);
 }


@@ -1216,11 +1216,11 @@ static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t*
 // Allocate a page inside a segment. Requires that the page has free pages
 static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) {
   mi_assert_internal(mi_segment_has_free(segment));
-  return mi_segment_find_free(segment, tld);
+  return mi_segment_find_free(segment, tld);
 }

 static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
-  // find an available segment the segment free queue
+  // find an available segment in the segment free queue
   mi_segment_queue_t* const free_queue = mi_segment_free_queue_of_kind(kind, tld);
   if (mi_segment_queue_is_empty(free_queue)) {
     // possibly allocate or reclaim a fresh segment
@@ -1275,7 +1275,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld
 }

 /* -----------------------------------------------------------
-   Page allocation
+   Page allocation
 ----------------------------------------------------------- */

 mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
diff --git a/test/test-stress.c b/test/test-stress.c
index 1b559a59..8958933e 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -20,7 +20,7 @@ terms of the MIT license.
 #include
 #include
 #include
-#include
+// #include

 // > mimalloc-test-stress [THREADS] [SCALE] [ITER]
 //
 static bool allow_large_objects = true;    // allow very large objects?
 static size_t use_one_size = 0;            // use single object size of `N * sizeof(uintptr_t)`?
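// As a usage sketch (the argument order follows the comment above; the
// concrete numbers here are only an illustration):
//
//   > mimalloc-test-stress 8 10 25
//
// would run 8 threads with scaling factor 10 for 25 full iterations;
// with no arguments the built-in defaults are used.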
-#ifdef USE_STD_MALLOC +#ifndef USE_STD_MALLOC #define custom_calloc(n,s) calloc(n,s) #define custom_realloc(p,s) realloc(p,s) #define custom_free(p) free(p) @@ -188,7 +188,7 @@ static void test_stress(void) { free_items(p); } } - mi_collect(false); + // mi_collect(false); #ifndef NDEBUG if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif @@ -206,7 +206,7 @@ static void leak(intptr_t tid) { } } -static void test_leak(void) { +static void test_leak(void) { for (int n = 0; n < ITER; n++) { run_os_threads(THREADS, &leak); mi_collect(false); @@ -242,15 +242,15 @@ int main(int argc, char** argv) { // Run ITER full iterations where half the objects in the transfer buffer survive to the next round. srand(0x7feb352d); - mi_stats_reset(); + // mi_stats_reset(); #ifdef STRESS test_stress(); #else test_leak(); -#endif +#endif - mi_collect(true); - mi_stats_print(NULL); + // mi_collect(true); + // mi_stats_print(NULL); //bench_end_program(); return 0; } @@ -262,7 +262,7 @@ static void (*thread_entry_fun)(intptr_t) = &stress; #include -static DWORD WINAPI thread_entry(LPVOID param) { +static DWORD WINAPI thread_entry(LPVOID param) { thread_entry_fun((intptr_t)param); return 0; } From ed1c8a203ab0ce9df97919767d01bc3f180ec2f1 Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 29 Jan 2020 23:08:12 -0800 Subject: [PATCH 06/62] improve performance with tls recursion counter --- include/mimalloc-internal.h | 19 +++++++++++-------- src/init.c | 23 +++++++++++++++-------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index f4b578f6..b2e57aec 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -275,24 +275,27 @@ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value o extern mi_heap_t _mi_heap_main; // statically allocated main backing heap extern bool _mi_process_is_initialized; +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from #ifdef MI_TLS_RECURSE_GUARD extern mi_heap_t* _mi_get_default_heap_tls_safe(void); +extern size_t _mi_tls_recurse; +#endif + static inline mi_heap_t* mi_get_default_heap(void) { + #ifdef MI_TLS_RECURSE_GUARD + if (_mi_tls_recurse++>100) { // on some BSD platforms, like macOS, the dynamic loader calls `malloc` // to initialize thread local data. To avoid recursion, we need to avoid // accessing the thread local `_mi_default_heap` until our module is loaded // and use the statically allocated main heap until that time. // TODO: patch ourselves dynamically to avoid this check every time? 
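  // (sketch of the intended flow: `_mi_tls_recurse` is initialized high
  //  (10000 in init.c) so the very first calls -- possibly during dynamic
  //  loading -- take this pthread-based safe path; it is reset to 0 just
  //  below, after which the plain `_mi_heap_default` read is used and the
  //  safe path is only re-taken roughly every 100 calls to re-synchronize)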
- return _mi_get_default_heap_tls_safe(); -#else - -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from - -static inline mi_heap_t* mi_get_default_heap(void) { + mi_heap_t* heap = _mi_get_default_heap_tls_safe(); + _mi_tls_recurse = 0; + return heap; + } + #endif return _mi_heap_default; - -#endif } static inline bool mi_heap_is_default(const mi_heap_t* heap) { diff --git a/src/init.c b/src/init.c index 922b7438..750be169 100644 --- a/src/init.c +++ b/src/init.c @@ -266,9 +266,8 @@ static void _mi_thread_done(mi_heap_t* default_heap); #endif // Set up handlers so `mi_thread_done` is called automatically -static bool tls_initialized = false; // fine if it races - static void mi_process_setup_auto_thread_done(void) { + static bool tls_initialized = false; // fine if it races if (tls_initialized) return; tls_initialized = true; #if defined(_WIN32) && defined(MI_SHARED_LIB) @@ -319,9 +318,6 @@ static void _mi_thread_done(mi_heap_t* heap) { void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(heap != NULL); - #ifndef MI_TLS_RECURSE_GUARD - _mi_heap_default = heap; - #endif // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. @@ -332,8 +328,18 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { #elif defined(MI_USE_PTHREADS) pthread_setspecific(mi_pthread_key, heap); #endif + if (_mi_tls_recurse < 100) { + _mi_heap_default = heap; + } } +#ifdef MI_TLS_RECURSE_GUARD +// initialize high so the first call uses safe TLS +size_t _mi_tls_recurse = 10000; +#else +size_t _mi_tls_recurse = 0; +#endif + mi_heap_t* _mi_get_default_heap_tls_safe(void) { if (mi_unlikely(mi_pthread_key==0)) return (mi_heap_t*)&_mi_heap_empty; mi_heap_t* heap = pthread_getspecific(mi_pthread_key); @@ -347,7 +353,6 @@ static void mi_process_done(void); static bool os_preloading = true; // true until this module is initialized static bool mi_redirected = false; // true if malloc redirects to mi_malloc -bool _mi_tls_initialized = false; // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. 
bool _mi_preloading() { @@ -395,7 +400,7 @@ static void mi_process_load(void) { volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; UNUSED(dummy); os_preloading = false; - _mi_tls_initialized = true; + _mi_heap_set_default_direct(&_mi_heap_main); atexit(&mi_process_done); _mi_options_init(); mi_process_init(); @@ -414,7 +419,9 @@ void _mi_heap_main_init(void) { if (_mi_heap_main.cookie == 0) { _mi_heap_main.thread_id = _mi_thread_id(); _mi_heap_main.cookie = _os_random_weak((uintptr_t)&_mi_heap_main_init); - _mi_random_init(&_mi_heap_main.random); + } + if (_mi_tls_recurse < 100) { + _mi_random_init(&_mi_heap_main.random); _mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main); } From a7c69ccbeaa92fe792fe4ff6c11e79076ed3aa5d Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 30 Jan 2020 06:25:42 -0800 Subject: [PATCH 07/62] fix stat accounting of segments with huge blocks --- include/mimalloc-internal.h | 1 + src/alloc.c | 29 +---------------------------- src/segment.c | 31 ++++++++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 6fca06b8..f18e459b 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -81,6 +81,7 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld); void _mi_segment_thread_collect(mi_segments_tld_t* tld); uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size); // page start for any page +void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc; diff --git a/src/alloc.c b/src/alloc.c index 3f577f2f..990bcf8b 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -176,33 +176,6 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block // Free // ------------------------------------------------------ -// free huge block from another thread -static mi_decl_noinline void mi_free_huge_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { - // huge page segments are always abandoned and can be freed immediately - mi_assert_internal(segment->page_kind==MI_PAGE_HUGE); - mi_assert_internal(segment == _mi_page_segment(page)); - mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0); - - // claim it and free - mi_heap_t* heap = mi_get_default_heap(); - // paranoia: if this it the last reference, the cas should always succeed - if (mi_atomic_cas_strong(&segment->thread_id, heap->thread_id, 0)) { - mi_block_set_next(page, block, page->free); - page->free = block; - page->used--; - page->is_zero = false; - mi_assert(page->used == 0); - mi_tld_t* tld = heap->tld; - const size_t bsize = mi_page_block_size(page); - if (bsize > MI_HUGE_OBJ_SIZE_MAX) { - _mi_stat_decrease(&tld->stats.giant, bsize); - } - else { - _mi_stat_decrease(&tld->stats.huge, bsize); - } - _mi_segment_page_free(page, true, &tld->segments); - } -} // multi-threaded free static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) @@ -210,7 +183,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc // huge page segments are always abandoned and can 
be freed immediately
   mi_segment_t* segment = _mi_page_segment(page);
   if (segment->page_kind==MI_PAGE_HUGE) {
-    mi_free_huge_block_mt(segment, page, block);
+    _mi_segment_huge_page_free(segment, page, block);
     return;
   }
diff --git a/src/segment.c b/src/segment.c
index 3914d770..25941354 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -461,7 +461,6 @@ static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) {
   if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size;
 }

-
 static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) {
   segment->thread_id = 0;
   mi_segments_track_size(-((long)segment_size),tld);
@@ -1039,11 +1038,41 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld
   if (segment == NULL) return NULL;
   mi_assert_internal(mi_segment_page_size(segment) - segment->segment_info_size - (2*(MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= size);
   segment->thread_id = 0; // huge pages are immediately abandoned
+  mi_segments_track_size(-(long)segment->segment_size, tld);
   mi_page_t* page = mi_segment_find_free(segment, tld);
   mi_assert_internal(page != NULL);
   return page;
 }

+// free huge block from another thread
+void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
+  // huge page segments are always abandoned and can be freed immediately by any thread
+  mi_assert_internal(segment->page_kind==MI_PAGE_HUGE);
+  mi_assert_internal(segment == _mi_page_segment(page));
+  mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0);
+
+  // claim it and free
+  mi_heap_t* heap = mi_get_default_heap();
+  // paranoia: if this is the last reference, the cas should always succeed
+  if (mi_atomic_cas_strong(&segment->thread_id, heap->thread_id, 0)) {
+    mi_block_set_next(page, block, page->free);
+    page->free = block;
+    page->used--;
+    page->is_zero = false;
+    mi_assert(page->used == 0);
+    mi_segments_tld_t* tld = &heap->tld->segments;
+    const size_t bsize = mi_page_block_size(page);
+    if (bsize > MI_HUGE_OBJ_SIZE_MAX) {
+      _mi_stat_decrease(&tld->stats->giant, bsize);
+    }
+    else {
+      _mi_stat_decrease(&tld->stats->huge, bsize);
+    }
+    mi_segments_track_size((long)segment->segment_size, tld);
+    _mi_segment_page_free(page, true, tld);
+  }
+}
+
 /* -----------------------------------------------------------
    Page allocation and free
 ----------------------------------------------------------- */

From 4531367de2bf551d5912bb612fd6b0c59a5bf849 Mon Sep 17 00:00:00 2001
From: daan
Date: Fri, 31 Jan 2020 13:20:02 -0800
Subject: [PATCH 08/62] fix padding check for aligned allocation; improve perf for small aligned allocations

---
 include/mimalloc-types.h | 15 ++++++----
 src/alloc-aligned.c      |  8 ++++--
 src/alloc-posix.c        | 13 ++++++---
 src/alloc.c              | 60 +++++++++++++++++++---------------------
 src/options.c            |  4 +--
 5 files changed, 53 insertions(+), 47 deletions(-)

diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index 39debae1..9cda377e 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -54,16 +54,19 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MI_ENCODE_FREELIST 1
 #endif

-// Reserve extra padding at the end of each block; must be a multiple of `sizeof(intptr_t)`!
+// Reserve extra padding at the end of each block; must be a multiple of `2*sizeof(intptr_t)`!
 // If free lists are encoded, the padding is checked if it was modified on free.
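// (layout sketch: the padding occupies the final MI_PADDING_SIZE bytes of a
//  block, just past the usable size, and holds an encoded pointer back to
//  the block start; on free, `mi_check_padding` in alloc.c decodes it with
//  the page keys, and an overflow that touches the padding makes the
//  decoded pointer differ from the block start, reported as an EFAULT
//  buffer-overflow error)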
-#if (!defined(MI_PADDING)) -#if (MI_SECURE>=3 || MI_DEBUG>=1) -#define MI_PADDING MI_MAX_ALIGN_SIZE +#if (!defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1)) +#define MI_PADDING +#endif + +#if defined(MI_PADDING) +#define MI_PADDING_SIZE (2*sizeof(intptr_t)) #else -#define MI_PADDING 0 -#endif +#define MI_PADDING_SIZE 0 #endif + // ------------------------------------------------------ // Platform specific values // ------------------------------------------------------ diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 55b0e041..3749fbc6 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -18,20 +18,22 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t // note: we don't require `size > offset`, we just guarantee that // the address at offset is aligned regardless of the allocated size. mi_assert(alignment > 0 && alignment % sizeof(void*) == 0); + + if (alignment <= MI_MAX_ALIGN_SIZE && offset==0) return _mi_heap_malloc_zero(heap, size, zero); if (mi_unlikely(size > PTRDIFF_MAX)) return NULL; // we don't allocate more than PTRDIFF_MAX (see ) if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see ) const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` // try if there is a small block available with just the right alignment - if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { - mi_page_t* page = _mi_heap_get_free_small_page(heap,size); + if (mi_likely(size <= (MI_SMALL_SIZE_MAX - MI_PADDING_SIZE))) { + mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE); const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0; if (mi_likely(page->free != NULL && is_aligned)) { #if MI_STAT>1 mi_heap_stat_increase( heap, malloc, size); #endif - void* p = _mi_page_malloc(heap,page,size); // TODO: inline _mi_page_malloc + void* p = _mi_page_malloc(heap,page,size + MI_PADDING_SIZE); // TODO: inline _mi_page_malloc mi_assert_internal(p != NULL); mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); if (zero) _mi_block_zero_init(page,p,size); diff --git a/src/alloc-posix.c b/src/alloc-posix.c index 505e42e4..ade8cc48 100644 --- a/src/alloc-posix.c +++ b/src/alloc-posix.c @@ -47,16 +47,19 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept // Note: The spec dictates we should not modify `*p` on an error. (issue#27) // if (p == NULL) return EINVAL; - if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment + if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment if (!_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2 - void* q = mi_malloc_aligned(size, alignment); + void* q = (alignment <= MI_MAX_ALIGN_SIZE ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); if (q==NULL && size != 0) return ENOMEM; + mi_assert_internal(((uintptr_t)q % alignment) == 0); *p = q; return 0; } void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { - return mi_malloc_aligned(size, alignment); + void* p = (alignment <= MI_MAX_ALIGN_SIZE ? 
mi_malloc(size) : mi_malloc_aligned(size, alignment)); + mi_assert_internal(((uintptr_t)p % alignment) == 0); + return p; } void* mi_valloc(size_t size) mi_attr_noexcept { @@ -73,7 +76,9 @@ void* mi_pvalloc(size_t size) mi_attr_noexcept { void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { if (alignment==0 || !_mi_is_power_of_two(alignment)) return NULL; if ((size&(alignment-1)) != 0) return NULL; // C11 requires integral multiple, see - return mi_malloc_aligned(size, alignment); + void* p = (alignment <= MI_MAX_ALIGN_SIZE ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); + mi_assert_internal(((uintptr_t)p % alignment) == 0); + return p; } void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD diff --git a/src/alloc.c b/src/alloc.c index 6852d652..34e65765 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -43,9 +43,9 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz mi_heap_stat_increase(heap,normal[bin], 1); } #endif -#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST) - mi_assert_internal((MI_PADDING % sizeof(mi_block_t*)) == 0); - mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + page->xblock_size - MI_PADDING); +#if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST) + mi_assert_internal((MI_PADDING_SIZE % sizeof(mi_block_t*)) == 0); + mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + page->xblock_size - MI_PADDING_SIZE); mi_block_set_nextx(page, padding, block, page->key[0], page->key[1]); #endif return block; @@ -53,39 +53,27 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz // allocate a small block extern inline mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { - mi_assert(size <= MI_SMALL_SIZE_MAX); - mi_page_t* page = _mi_heap_get_free_small_page(heap,size); - return _mi_page_malloc(heap, page, size); + mi_assert(size <= (MI_SMALL_SIZE_MAX - MI_PADDING_SIZE)); + mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE); + void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE); + mi_assert_internal(p==NULL || mi_page_block_size(_mi_ptr_page(p)) >= (size + MI_PADDING_SIZE)); + return p; } extern inline mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept { -#if (MI_PADDING>0) - size += MI_PADDING; -#endif return mi_heap_malloc_small(mi_get_default_heap(), size); } - -// zero initialized small block -mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept { - void* p = mi_malloc_small(size); - if (p != NULL) { memset(p, 0, size); } - return p; -} - // The main allocation function extern inline mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { mi_assert(heap!=NULL); mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local -#if (MI_PADDING>0) - size += MI_PADDING; -#endif void* p; - if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { + if (mi_likely(size <= (MI_SMALL_SIZE_MAX - MI_PADDING_SIZE))) { p = mi_heap_malloc_small(heap, size); } else { - p = _mi_malloc_generic(heap, size); + p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE); } #if MI_STAT>1 if (p != NULL) { @@ -93,6 +81,7 @@ extern inline mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t siz mi_heap_stat_increase( heap, malloc, mi_good_size(size) ); // overestimate for aligned sizes } #endif + mi_assert_internal(p == NULL || mi_page_block_size(_mi_ptr_page(p)) >= (size + MI_PADDING_SIZE)); return p; } @@ 
-100,24 +89,34 @@ extern inline mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept { return mi_heap_malloc(mi_get_default_heap(), size); } + void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) { // note: we need to initialize the whole block to zero, not just size // or the recalloc/rezalloc functions cannot safely expand in place (see issue #63) UNUSED_RELEASE(size); mi_assert_internal(p != NULL); - mi_assert_internal(mi_page_block_size(page) >= size); // size can be zero + mi_assert_internal(mi_page_block_size(page) >= (size + MI_PADDING_SIZE)); // size can be zero mi_assert_internal(_mi_ptr_page(p)==page); if (page->is_zero) { // already zero initialized memory? ((mi_block_t*)p)->next = 0; // clear the free list pointer - mi_assert_expensive(mi_mem_is_zero(p, mi_page_block_size(page) - MI_PADDING)); + mi_assert_expensive(mi_mem_is_zero(p, mi_page_block_size(page) - MI_PADDING_SIZE)); } else { // otherwise memset - memset(p, 0, mi_page_block_size(page) - MI_PADDING); + memset(p, 0, mi_page_block_size(page) - MI_PADDING_SIZE); } } +// zero initialized small block +mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept { + void* p = mi_malloc_small(size); + if (p != NULL) { + _mi_block_zero_init(_mi_ptr_page(p), p, size); // todo: can we avoid getting the page again? + } + return p; +} + void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) { void* p = mi_heap_malloc(heap,size); if (zero && p != NULL) { @@ -182,9 +181,9 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block } #endif -#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST) +#if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST) static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + page->xblock_size - MI_PADDING); + mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + page->xblock_size - MI_PADDING_SIZE); mi_block_t* const decoded = mi_block_nextx(page, padding, page->key[0], page->key[1]); if (decoded != block) { _mi_error_message(EFAULT, "buffer overflow in heap block %p: write after %zu bytes\n", block, page->xblock_size); @@ -285,7 +284,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block) { #if (MI_DEBUG) - memset(block, MI_DEBUG_FREED, mi_page_block_size(page) - MI_PADDING); + memset(block, MI_DEBUG_FREED, mi_page_block_size(page) - MI_PADDING_SIZE); #endif // and push it on the free list @@ -411,10 +410,7 @@ size_t mi_usable_size(const void* p) mi_attr_noexcept { if (p==NULL) return 0; const mi_segment_t* segment = _mi_ptr_segment(p); const mi_page_t* page = _mi_segment_page_of(segment, p); - size_t size = mi_page_block_size(page); -#if defined(MI_PADDING) - size -= MI_PADDING; -#endif + size_t size = mi_page_block_size(page) - MI_PADDING_SIZE; if (mi_unlikely(mi_page_has_aligned(page))) { ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p); mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); diff --git a/src/options.c b/src/options.c index 7559a4b5..0484c183 100644 --- a/src/options.c +++ b/src/options.c @@ -67,10 +67,10 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N 
segments per thread - { 1, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free + { 0, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free { 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) - { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed + { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 100, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From 724602b78b1c4a7896c8b615cddbe43358f27801 Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 31 Jan 2020 17:27:45 -0800 Subject: [PATCH 09/62] enable page-reset by default --- src/options.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/options.c b/src/options.c index 0484c183..7559a4b5 100644 --- a/src/options.c +++ b/src/options.c @@ -67,10 +67,10 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread - { 0, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free + { 1, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free { 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) - { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed + { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 100, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From 8422ab125da114e8cad967889860cc9943b8cca0 Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 31 Jan 2020 17:28:26 -0800 Subject: [PATCH 10/62] improve messages; fix reset size calculation on large pages --- src/arena.c | 2 +- src/os.c | 4 ++-- src/segment.c | 10 +++++++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/arena.c b/src/arena.c index 7bf8099b..724fc52c 100644 --- a/src/arena.c +++ b/src/arena.c @@ -283,7 +283,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec _mi_warning_message("failed to reserve %zu gb huge pages\n", pages); return ENOMEM; } - _mi_verbose_message("reserved %zu gb huge pages on numa node %i (of the %zu gb requested)\n", pages_reserved, numa_node, pages); + _mi_verbose_message("numa node %i: reserved %zu gb huge pages (of the %zu gb requested)\n", numa_node, pages_reserved, pages); size_t bcount = mi_block_count_of_size(hsize); size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); diff --git a/src/os.c b/src/os.c index b8dfaa70..970eeb94 100644 --- a/src/os.c +++ b/src/os.c @@ -851,7 +851,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) else { // fall back to regular large pages mi_huge_pages_available = false; // don't try further huge pages - _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err); + _mi_warning_message("unable to allocate using huge (1gb) pages, trying large (2mb) pages instead (status 0x%lx)\n", err); } } // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation @@ -892,7 +892,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) // see: long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); if (err != 0) { - _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno)); + _mi_warning_message("failed to bind huge (1gb) pages to numa node %d: %s\n", numa_node, strerror(errno)); } } return p; diff --git a/src/segment.c b/src/segment.c index c7a9662b..01a8a693 100644 --- a/src/segment.c +++ b/src/segment.c @@ -247,6 +247,7 @@ static void mi_page_reset(mi_segment_t* segment, mi_page_t* page, size_t size, m static void mi_page_unreset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld) { mi_assert_internal(page->is_reset); + mi_assert_internal(page->is_committed); mi_assert_internal(!segment->mem_is_fixed); page->is_reset = false; size_t psize; @@ -779,10 +780,14 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a // note: must come after setting `segment_in_use` to false but before block_size becomes 0 //mi_page_reset(segment, page, 0 /*used_size*/, tld); - // zero the page data, but not the segment fields and block_size (for page size calculations) + // zero the page data, but not the segment fields and capacity, and block_size (for page size calculations) uint32_t block_size = page->xblock_size; + uint16_t capacity = page->capacity; + uint16_t reserved = page->reserved; ptrdiff_t ofs = offsetof(mi_page_t,capacity); memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs); + page->capacity = capacity; + page->reserved = reserved; page->xblock_size = block_size; segment->used--; @@ -790,6 +795,9 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a if (allow_reset) 
{ mi_pages_reset_add(segment, page, tld); } + + page->capacity = 0; // after reset these can be zero'd now + page->reserved = 0; } void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) From 68112a2751d4b4388d91381fce3afb79e3c00eec Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 31 Jan 2020 20:34:24 -0800 Subject: [PATCH 11/62] better padding implementation, more precise statistics --- include/mimalloc-internal.h | 12 ++++- include/mimalloc-types.h | 28 +++++----- src/alloc-aligned.c | 2 +- src/alloc.c | 102 ++++++++++++++++++++---------------- src/page.c | 6 +-- test/main-override-static.c | 2 +- test/test-stress.c | 2 +- 7 files changed, 89 insertions(+), 65 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index c7d7a1da..2c8d767c 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -310,8 +310,10 @@ static inline uintptr_t _mi_ptr_cookie(const void* p) { ----------------------------------------------------------- */ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) { - mi_assert_internal(size <= MI_SMALL_SIZE_MAX); - return heap->pages_free_direct[_mi_wsize_from_size(size)]; + mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); + const size_t idx = _mi_wsize_from_size(size); + mi_assert_internal(idx < MI_PAGES_DIRECT); + return heap->pages_free_direct[idx]; } // Get the page belonging to a certain size class @@ -375,6 +377,12 @@ static inline size_t mi_page_block_size(const mi_page_t* page) { } } +// Get the client usable block size of a page (without padding etc) +static inline size_t mi_page_usable_block_size(const mi_page_t* page) { + return mi_page_block_size(page) - MI_PADDING_SIZE; +} + + // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { return (mi_block_t*)(mi_atomic_read_relaxed(&page->xthread_free) & ~3); } diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 9cda377e..8712c54a 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -54,16 +54,17 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_INTPTR_SIZE (1<free+offset) & align_mask)==0; if (mi_likely(page->free != NULL && is_aligned)) diff --git a/src/alloc.c b/src/alloc.c index 34e65765..999a6ca5 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -38,14 +38,15 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz block->next = 0; // don't leak internal data #endif #if (MI_STAT>1) - if(size <= MI_LARGE_OBJ_SIZE_MAX) { - size_t bin = _mi_bin(size); + const size_t bsize = mi_page_usable_block_size(page); + if(bsize <= MI_LARGE_OBJ_SIZE_MAX) { + const size_t bin = _mi_bin(bsize); mi_heap_stat_increase(heap,normal[bin], 1); } #endif #if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST) mi_assert_internal((MI_PADDING_SIZE % sizeof(mi_block_t*)) == 0); - mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + page->xblock_size - MI_PADDING_SIZE); + mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + mi_page_usable_block_size(page)); mi_block_set_nextx(page, padding, block, page->key[0], page->key[1]); #endif return block; @@ -53,10 +54,18 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz // allocate a small block extern inline mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { - mi_assert(size <= (MI_SMALL_SIZE_MAX - MI_PADDING_SIZE)); + mi_assert(heap!=NULL); + mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local + mi_assert(size <= MI_SMALL_SIZE_MAX); mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE); void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE); - mi_assert_internal(p==NULL || mi_page_block_size(_mi_ptr_page(p)) >= (size + MI_PADDING_SIZE)); + mi_assert_internal(p==NULL || mi_usable_size(p) >= size); + #if MI_STAT>1 + if (p != NULL) { + if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); } + mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); + } + #endif return p; } @@ -66,23 +75,22 @@ extern inline mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexc // The main allocation function extern inline mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { - mi_assert(heap!=NULL); - mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local - void* p; - if (mi_likely(size <= (MI_SMALL_SIZE_MAX - MI_PADDING_SIZE))) { - p = mi_heap_malloc_small(heap, size); + if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { + return mi_heap_malloc_small(heap, size); } else { - p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE); + mi_assert(heap!=NULL); + mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local + void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE); + mi_assert_internal(p == NULL || mi_usable_size(p) >= size); + #if MI_STAT>1 + if (p != NULL) { + if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); } + mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); + } + #endif + return p; } - #if MI_STAT>1 - if (p != NULL) { - if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); } - mi_heap_stat_increase( heap, malloc, mi_good_size(size) ); // overestimate for aligned sizes - } - #endif - mi_assert_internal(p == NULL || mi_page_block_size(_mi_ptr_page(p)) >= (size + MI_PADDING_SIZE)); - return p; } extern inline mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept { @@ -91,20 +99,20 @@ extern inline 
mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept { return mi_heap_malloc(mi_get_default_heap(), size); } + void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) { - // note: we need to initialize the whole block to zero, not just size + // note: we need to initialize the whole usable block size to zero, not just the requested size, // or the recalloc/rezalloc functions cannot safely expand in place (see issue #63) UNUSED_RELEASE(size); mi_assert_internal(p != NULL); - mi_assert_internal(mi_page_block_size(page) >= (size + MI_PADDING_SIZE)); // size can be zero + mi_assert_internal(mi_usable_size(p) >= size); // size can be zero mi_assert_internal(_mi_ptr_page(p)==page); if (page->is_zero) { // already zero initialized memory? ((mi_block_t*)p)->next = 0; // clear the free list pointer - mi_assert_expensive(mi_mem_is_zero(p, mi_page_block_size(page) - MI_PADDING_SIZE)); + mi_assert_expensive(mi_mem_is_zero(p, mi_page_usable_block_size(page))); } else { // otherwise memset - memset(p, 0, mi_page_block_size(page) - MI_PADDING_SIZE); + memset(p, 0, mi_page_usable_block_size(page)); } } +// zero initialized small block +mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept { + void* p = mi_malloc_small(size); + if (p != NULL) { + _mi_block_zero_init(_mi_ptr_page(p), p, size); // todo: can we avoid getting the page again? + } + return p; +} + void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) { void* p = mi_heap_malloc(heap,size); if (zero && p != NULL) { @@ -182,9 +181,9 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block } #endif -#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST) +#if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST) static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + page->xblock_size - MI_PADDING_SIZE); + mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + mi_page_usable_block_size(page)); mi_block_t* const decoded = mi_block_nextx(page, padding, page->key[0], page->key[1]); if (decoded != block) { - _mi_error_message(EFAULT, "buffer overflow in heap block %p: write after %zu bytes\n", block, page->xblock_size); + const ptrdiff_t size = (uint8_t*)padding - (uint8_t*)block; + _mi_error_message(EFAULT, "buffer overflow in heap block %p: write after %zd bytes\n", block, size ); } } #else @@ -208,7 +217,7 @@ static mi_decl_noinline void mi_free_huge_block_mt(mi_segment_t* segment, mi_pag mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0); // claim it and free - mi_heap_t* heap = mi_get_default_heap(); + mi_heap_t* const heap = mi_get_default_heap(); // paranoia: if this is the last reference, the cas should always succeed @@ -216,8 +225,8 @@ static mi_decl_noinline void mi_free_huge_block_mt(mi_segment_t* segment, mi_pag page->used--; page->is_zero = false; mi_assert(page->used == 0); - mi_tld_t* tld = heap->tld; - const size_t bsize = mi_page_block_size(page); + mi_tld_t* const tld = heap->tld; + const size_t bsize = mi_page_usable_block_size(page); if (bsize > MI_HUGE_OBJ_SIZE_MAX) { _mi_stat_decrease(&tld->stats.giant, bsize); } @@ -232,14 +241,17 @@ static mi_decl_noinline void mi_free_huge_block_mt(mi_segment_t* segment, mi_pag static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) { // huge page segments are always abandoned and can be freed immediately - mi_segment_t* const segment = _mi_page_segment(page); + mi_segment_t* const segment = _mi_page_segment(page); if (segment->page_kind==MI_PAGE_HUGE) { mi_free_huge_block_mt(segment, page, block); return; } + // The padding check accesses the non-thread-owned page for the key values. + // that is safe as these are constant and the page won't be freed (as the block is not freed yet). 
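// (Editorial note, not part of the patch: the page keys are assigned once in
// mi_page_init and stay constant while the page has blocks in use, and this
// block itself keeps the page alive, so the cross-thread read of the keys
// in the check below needs no synchronization.)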
mi_check_padding(page, block); + // Try to put the block on either the page-local thread free list, or the heap delayed free list. mi_thread_free_t tfree; mi_thread_free_t tfreex; bool use_delayed; @@ -259,7 +271,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc if (mi_unlikely(use_delayed)) { // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* heap = mi_page_heap(page); + mi_heap_t* const heap = mi_page_heap(page); mi_assert_internal(heap != NULL); if (heap != NULL) { // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) @@ -311,15 +323,15 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block // Adjust a block that was allocated aligned, to the actual start of the block in the page. mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); - size_t adjust = (diff % mi_page_block_size(page)); + const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); + const size_t adjust = (diff % mi_page_block_size(page)); return (mi_block_t*)((uintptr_t)p - adjust); } static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool local, void* p) { - mi_page_t* page = _mi_segment_page_of(segment, p); - mi_block_t* block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); + mi_page_t* const page = _mi_segment_page_of(segment, p); + mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); _mi_free_block(page, local, block); } @@ -356,12 +368,12 @@ void mi_free(void* p) mi_attr_noexcept mi_page_t* const page = _mi_segment_page_of(segment, p); #if (MI_STAT>1) - mi_heap_t* heap = mi_heap_get_default(); - mi_heap_stat_decrease(heap, malloc, mi_usable_size(p)); - if (page->xblock_size <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, normal[_mi_bin(page->xblock_size)], 1); - } - // huge page stat is accounted for in `_mi_page_retire` + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_usable_block_size(page); + mi_heap_stat_decrease(heap, malloc, bsize); + if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { // huge page stats are accounted for in `_mi_page_retire` + mi_heap_stat_decrease(heap, normal[_mi_bin(bsize)], 1); + } #endif if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks @@ -385,10 +397,10 @@ void mi_free(void* p) mi_attr_noexcept bool _mi_free_delayed_block(mi_block_t* block) { // get segment and page - const mi_segment_t* segment = _mi_ptr_segment(block); + const mi_segment_t* const segment = _mi_ptr_segment(block); mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(_mi_thread_id() == segment->thread_id); - mi_page_t* page = _mi_segment_page_of(segment, block); + mi_page_t* const page = _mi_segment_page_of(segment, block); // Clear the no-delayed flag so delayed freeing is used again for this page. 
// This must be done before collecting the free lists on this page -- otherwise @@ -408,9 +420,9 @@ bool _mi_free_delayed_block(mi_block_t* block) { // Bytes available in a block size_t mi_usable_size(const void* p) mi_attr_noexcept { if (p==NULL) return 0; - const mi_segment_t* segment = _mi_ptr_segment(p); - const mi_page_t* page = _mi_segment_page_of(segment, p); - size_t size = mi_page_block_size(page) - MI_PADDING_SIZE; + const mi_segment_t* const segment = _mi_ptr_segment(p); + const mi_page_t* const page = _mi_segment_page_of(segment, p); + const size_t size = mi_page_usable_block_size(page); if (mi_unlikely(mi_page_has_aligned(page))) { ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p); mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); diff --git a/src/page.c b/src/page.c index edbc7411..57adbc91 100644 --- a/src/page.c +++ b/src/page.c @@ -752,7 +752,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) { mi_assert_internal(_mi_bin(block_size) == MI_BIN_HUGE); mi_page_t* page = mi_page_fresh_alloc(heap,NULL,block_size); if (page != NULL) { - const size_t bsize = mi_page_block_size(page); + const size_t bsize = mi_page_usable_block_size(page); mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(bsize >= size); mi_assert_internal(_mi_page_segment(page)->page_kind==MI_PAGE_HUGE); @@ -761,11 +761,11 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) { mi_page_set_heap(page, NULL); if (bsize > MI_HUGE_OBJ_SIZE_MAX) { - _mi_stat_increase(&heap->tld->stats.giant, block_size); + _mi_stat_increase(&heap->tld->stats.giant, bsize); _mi_stat_counter_increase(&heap->tld->stats.giant_count, 1); } else { - _mi_stat_increase(&heap->tld->stats.huge, block_size); + _mi_stat_increase(&heap->tld->stats.huge, bsize); _mi_stat_counter_increase(&heap->tld->stats.huge_count, 1); } } diff --git a/test/main-override-static.c b/test/main-override-static.c index a1c3edee..4bbff192 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -19,7 +19,7 @@ int main() { // double_free1(); // double_free2(); // corrupt_free(); - // block_overflow1(); + //block_overflow1(); void* p1 = malloc(78); void* p2 = malloc(24); diff --git a/test/test-stress.c b/test/test-stress.c index 1b559a59..05254e5d 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -27,7 +27,7 @@ terms of the MIT license. 
// argument defaults static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 10; // scaling factor -static int ITER = 50; // N full iterations destructing and re-creating all threads +static int ITER = 10; // N full iterations destructing and re-creating all threads // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int SCALE = 100; // scaling factor From 40f1e1e07b9452ad46ae47dfb3887e7f5cb6ca4d Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 31 Jan 2020 23:39:51 -0800 Subject: [PATCH 12/62] byte-precise heap block overflow checking with encoded padding --- ide/vs2019/mimalloc.vcxproj | 2 +- include/mimalloc-internal.h | 3 +- include/mimalloc-types.h | 30 +++++--- src/alloc.c | 135 +++++++++++++++++++++++++++--------- src/init.c | 10 ++- test/main-override-static.c | 6 +- test/test-stress.c | 2 +- 7 files changed, 138 insertions(+), 50 deletions(-) diff --git a/ide/vs2019/mimalloc.vcxproj b/ide/vs2019/mimalloc.vcxproj index a1372204..fad6de5d 100644 --- a/ide/vs2019/mimalloc.vcxproj +++ b/ide/vs2019/mimalloc.vcxproj @@ -248,4 +248,4 @@ - + \ No newline at end of file diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 2c8d767c..be10bdc3 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -377,7 +377,8 @@ static inline size_t mi_page_block_size(const mi_page_t* page) { } } -// Get the client usable block size of a page (without padding etc) +// Get the usable block size of a page without fixed padding. +// This may still include internal padding due to alignment and rounding up size classes. static inline size_t mi_page_usable_block_size(const mi_page_t* page) { return mi_page_block_size(page) - MI_PADDING_SIZE; } diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 8712c54a..ccb37fcf 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -49,23 +49,17 @@ terms of the MIT license. A copy of the license can be found in the file #endif // Encoded free lists allow detection of corrupted free lists -// and can detect buffer overflows and double `free`s. +// and can detect buffer overflows, modify after free, and double `free`s. #if (MI_SECURE>=3 || MI_DEBUG>=1) #define MI_ENCODE_FREELIST 1 #endif // Reserve extra padding at the end of each block to be more resilient against heap block overflows. -// If free lists are encoded, the padding is checked if it was modified on free. +// If free lists are encoded, the padding can detect byte-precise buffer overflow on free. #if (!defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1)) #define MI_PADDING #endif -// The padding size must be at least `sizeof(intptr_t)`! -#if defined(MI_PADDING) -#define MI_PADDING_WSIZE 1 -#else -#define MI_PADDING_WSIZE 0 -#endif // ------------------------------------------------------ @@ -95,7 +89,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_INTPTR_SIZE (1<xblock_size==0||mi_page_block_size(page) >= size); mi_block_t* block = page->free; if (mi_unlikely(block == NULL)) { @@ -29,25 +29,29 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); // pop from the free list - page->free = mi_block_next(page,block); + page->free = mi_block_next(page, block); page->used++; mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); -#if (MI_DEBUG!=0) +#if (MI_DEBUG>0) if (!page->is_zero) { memset(block, MI_DEBUG_UNINIT, size); } #elif (MI_SECURE!=0) block->next = 0; // don't leak internal data #endif #if (MI_STAT>1) const size_t bsize = mi_page_usable_block_size(page); - if(bsize <= MI_LARGE_OBJ_SIZE_MAX) { + if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { const size_t bin = _mi_bin(bsize); - mi_heap_stat_increase(heap,normal[bin], 1); + mi_heap_stat_increase(heap, normal[bin], 1); } #endif #if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST) - mi_assert_internal((MI_PADDING_SIZE % sizeof(mi_block_t*)) == 0); - mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + mi_page_usable_block_size(page)); - mi_block_set_nextx(page, padding, block, page->key[0], page->key[1]); + mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); + ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); + mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); + padding->block = (uint32_t)(((uintptr_t)block >> MI_INTPTR_SHIFT) ^ page->key[0]); + padding->delta = (uint32_t)(delta ^ page->key[1]); + uint8_t* fill = (uint8_t*)padding - delta; + for (ptrdiff_t i = 0; i < delta; i++) { fill[i] = MI_DEBUG_PADDING; } #endif return block; } @@ -101,18 +105,18 @@ extern inline mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept { void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) { // note: we need to initialize the whole usable block size to zero, not just the requested size, // or the recalloc/rezalloc functions cannot safely expand in place (see issue #63) - UNUSED_RELEASE(size); + UNUSED(size); mi_assert_internal(p != NULL); mi_assert_internal(mi_usable_size(p) >= size); // size can be zero mi_assert_internal(_mi_ptr_page(p)==page); if (page->is_zero) { // already zero initialized memory? 
((mi_block_t*)p)->next = 0; // clear the free list pointer - mi_assert_expensive(mi_mem_is_zero(p, mi_page_usable_block_size(page))); + mi_assert_expensive(mi_mem_is_zero(p, mi_usable_size(p))); } else { // otherwise memset - memset(p, 0, mi_page_usable_block_size(page)); + memset(p, 0, mi_usable_size(p)); } } @@ -189,20 +193,82 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block } #endif +// --------------------------------------------------------------------------- +// Check for heap block overflow by setting up padding at the end of the block +// --------------------------------------------------------------------------- + #if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST) -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - mi_block_t* const padding = (mi_block_t*)((uint8_t*)block + mi_page_usable_block_size(page)); - mi_block_t* const decoded = mi_block_nextx(page, padding, page->key[0], page->key[1]); - if (decoded != block) { - const ptrdiff_t size = (uint8_t*)padding - (uint8_t*)block; - _mi_error_message(EFAULT, "buffer overflow in heap block %p: write after %zd bytes\n", block, size ); +static mi_padding_t mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* bsize) { + *bsize = mi_page_usable_block_size(page); + const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); + mi_padding_t pad; + pad.block = padding->block ^ (uint32_t)page->key[0]; + pad.delta = padding->delta ^ (uint32_t)page->key[1]; + return pad; +} + +// Return the exact usable size of a block. +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + size_t bsize; + mi_padding_t pad = mi_page_decode_padding(page, block, &bsize); + return bsize - pad.delta; +} + +static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { + size_t bsize; + const mi_padding_t pad = mi_page_decode_padding(page, block, &bsize); + *size = *wrong = bsize; + if ((uint32_t)((uintptr_t)block >> MI_INTPTR_SHIFT) != pad.block) return false; + if (pad.delta > bsize) return false; // can be equal for zero-sized allocation! + *size = bsize - pad.delta; + uint8_t* fill = (uint8_t*)block + bsize - pad.delta; + for (uint32_t i = 0; i < pad.delta; i++) { + if (fill[i] != MI_DEBUG_PADDING) { + *wrong = bsize - pad.delta + i; + return false; + } } + return true; +} + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + size_t size; + size_t wrong; + if (!mi_verify_padding(page,block,&size,&wrong)) { + _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); + } +} + +// When a non-thread-local block is freed, it becomes part of the thread delayed free +// list that is freed later by the owning heap. If the exact usable size is too small to +// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) +// so it will later not trigger an overflow error in `mi_free_block`. 
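// (Editorial worked example for the comment above: for mi_malloc(0) the whole
// usable size is fill, so delta == bsize. A free from another thread then has
// to write a mi_block_t link into the block, overwriting fill bytes that
// mi_verify_padding would report as an overflow; shrinking delta to
// bsize - sizeof(mi_block_t) declares those bytes as used instead.)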
+static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + size_t bsize; + mi_padding_t pad = mi_page_decode_padding(page, block, &bsize); + if ((bsize - pad.delta) >= min_size) return; + mi_assert_internal(bsize >= min_size); + ptrdiff_t delta = (bsize - min_size); + mi_assert_internal(delta >= 0 && delta < (ptrdiff_t)bsize); + mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); + padding->delta = (uint32_t)(delta ^ page->key[1]); } #else static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { UNUSED(page); UNUSED(block); } + +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + UNUSED(block); + return mi_page_usable_block_size(page); +} + +static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + UNUSED(page); + UNUSED(block); + UNUSED(min_size); +} #endif // ------------------------------------------------------ @@ -240,6 +306,14 @@ static mi_decl_noinline void mi_free_huge_block_mt(mi_segment_t* segment, mi_pag // multi-threaded free static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) { + // The padding check may access the non-thread-owned page for the key values. + // that is safe as these are constant and the page won't be freed (as the block is not freed yet). + mi_check_padding(page, block); + mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection + #if (MI_DEBUG!=0) + memset(block, MI_DEBUG_FREED, mi_usable_size(block)); + #endif + // huge page segments are always abandoned and can be freed immediately mi_segment_t* const segment = _mi_page_segment(page); if (segment->page_kind==MI_PAGE_HUGE) { @@ -247,10 +321,6 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc return; } - // The padding check accesses the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); - // Try to put the block on either the page-local thread free list, or the heap delayed free list. 
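The pushes onto these two lists are the classic lock-free prepend, the same retry loop this function builds with mi_atomic_cas_ptr_weak. An editorial sketch with plain C11 atomics and plain pointers follows; mimalloc additionally key-encodes the link and packs the delayed-free state into the low pointer bits:

#include <stdatomic.h>
#include <stdio.h>

typedef struct node_s { struct node_s* next; } node_t;

/* Prepend n to the list at head. On CAS failure `expected` is refreshed
   with the current head, so the link is re-written before retrying. */
static void lockfree_push(_Atomic(node_t*)* head, node_t* n) {
  node_t* expected = atomic_load_explicit(head, memory_order_relaxed);
  do {
    n->next = expected;
  } while (!atomic_compare_exchange_weak_explicit(
               head, &expected, n, memory_order_release, memory_order_relaxed));
}

int main(void) {
  _Atomic(node_t*) head = NULL;
  node_t a = { NULL }, b = { NULL };
  lockfree_push(&head, &a);
  lockfree_push(&head, &b);
  printf("top == &b: %d\n", atomic_load(&head) == &b);  /* prints 1 */
  return 0;
}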
mi_thread_free_t tfree; mi_thread_free_t tfreex; @@ -295,15 +365,14 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc // regular free static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block) { - #if (MI_DEBUG) - memset(block, MI_DEBUG_FREED, mi_page_block_size(page) - MI_PADDING_SIZE); - #endif - // and push it on the free list if (mi_likely(local)) { // owning thread can free a block directly if (mi_unlikely(mi_check_is_double_free(page, block))) return; mi_check_padding(page, block); + #if (MI_DEBUG!=0) + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); + #endif mi_block_set_next(page, block, page->local_free); page->local_free = block; page->used--; @@ -312,7 +381,7 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block } else if (mi_unlikely(mi_page_is_in_full(page))) { _mi_page_unfull(page); - } + } } else { _mi_free_block_mt(page,block); @@ -366,6 +435,7 @@ void mi_free(void* p) mi_attr_noexcept const uintptr_t tid = _mi_thread_id(); mi_page_t* const page = _mi_segment_page_of(segment, p); + mi_block_t* const block = (mi_block_t*)p; #if (MI_STAT>1) mi_heap_t* const heap = mi_heap_get_default(); @@ -377,16 +447,18 @@ void mi_free(void* p) mi_attr_noexcept #endif if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks - // local, and not full or aligned - mi_block_t* const block = (mi_block_t*)p; + // local, and not full or aligned if (mi_unlikely(mi_check_is_double_free(page,block))) return; mi_check_padding(page, block); + #if (MI_DEBUG!=0) + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); + #endif mi_block_set_next(page, block, page->local_free); page->local_free = block; page->used--; if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); - } + } } else { // non-local, aligned blocks, or a full page; use the more generic path @@ -422,9 +494,10 @@ size_t mi_usable_size(const void* p) mi_attr_noexcept { if (p==NULL) return 0; const mi_segment_t* const segment = _mi_ptr_segment(p); const mi_page_t* const page = _mi_segment_page_of(segment, p); - const size_t size = mi_page_usable_block_size(page); + const mi_block_t* const block = (const mi_block_t*)p; + const size_t size = mi_page_usable_size_of(page, block); if (mi_unlikely(mi_page_has_aligned(page))) { - ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p); + ptrdiff_t const adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p); mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); return (size - adjust); } diff --git a/src/init.c b/src/init.c index f8411187..c657fa4c 100644 --- a/src/init.c +++ b/src/init.c @@ -31,8 +31,14 @@ const mi_page_t _mi_page_empty = { }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) -#define MI_SMALL_PAGES_EMPTY \ - { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } + +#if defined(MI_PADDING) && (MI_INTPTR_SIZE >= 8) +#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } +#elif defined(MI_PADDING) +#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } +#else +#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() } +#endif // Empty page queues for every bin diff --git a/test/main-override-static.c b/test/main-override-static.c index 4bbff192..839a5d2f 100644 --- a/test/main-override-static.c +++ 
b/test/main-override-static.c @@ -19,7 +19,7 @@ int main() { // double_free1(); // double_free2(); // corrupt_free(); - //block_overflow1(); + // block_overflow1(); void* p1 = malloc(78); void* p2 = malloc(24); @@ -44,8 +44,8 @@ int main() { } static void block_overflow1() { - void* p = mi_malloc(16); - memset(p, 0, 17); + uint8_t* p = (uint8_t*)mi_malloc(17); + p[18] = 0; free(p); } diff --git a/test/test-stress.c b/test/test-stress.c index 05254e5d..1b559a59 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -27,7 +27,7 @@ terms of the MIT license. // argument defaults static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 10; // scaling factor -static int ITER = 10; // N full iterations destructing and re-creating all threads +static int ITER = 50; // N full iterations destructing and re-creating all threads // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int SCALE = 100; // scaling factor From aa68b8cbc7830bebbaec98f8c851a5f358993614 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 1 Feb 2020 12:15:12 -0800 Subject: [PATCH 13/62] improve encoding of padding canary and buffer overflow detection --- include/mimalloc-internal.h | 33 ++++++++++++++--------- include/mimalloc-types.h | 25 +++++++++-------- src/alloc.c | 54 ++++++++++++++++++++----------------- src/heap.c | 6 ++--- src/init.c | 12 ++++----- src/page.c | 14 +++++----- 6 files changed, 78 insertions(+), 66 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index be10bdc3..9bba6e8f 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -519,30 +519,37 @@ static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) { return ((x >> shift) | (x << (MI_INTPTR_BITS - shift))); } -static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, uintptr_t key1, uintptr_t key2 ) { +static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { + void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]); + return (mi_unlikely(p==null) ? NULL : p); +} + +static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) { + uintptr_t x = (uintptr_t)(mi_unlikely(p==NULL) ? 
null : p); + return mi_rotl(x ^ keys[1], keys[0]) + keys[0]; +} + +static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) { #ifdef MI_ENCODE_FREELIST - mi_block_t* b = (mi_block_t*)(mi_rotr(block->next - key1, key1) ^ key2); - if (mi_unlikely((void*)b==null)) { b = NULL; } - return b; + return (mi_block_t*)mi_ptr_decode(null, block->next, keys); #else - UNUSED(key1); UNUSED(key2); UNUSED(null); + UNUSED(keys); UNUSED(null); return (mi_block_t*)block->next; #endif } -static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, uintptr_t key1, uintptr_t key2) { +static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) { #ifdef MI_ENCODE_FREELIST - if (mi_unlikely(next==NULL)) { next = (mi_block_t*)null; } - block->next = mi_rotl((uintptr_t)next ^ key2, key1) + key1; + block->next = mi_ptr_encode(null, next, keys); #else - UNUSED(key1); UNUSED(key2); UNUSED(null); + UNUSED(keys); UNUSED(null); block->next = (mi_encoded_t)next; #endif } static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) { #ifdef MI_ENCODE_FREELIST - mi_block_t* next = mi_block_nextx(page,block,page->key[0],page->key[1]); + mi_block_t* next = mi_block_nextx(page,block,page->keys); // check for free list corruption: is `next` at least in the same page? // TODO: check if `next` is `page->block_size` aligned? if (mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next))) { @@ -552,16 +559,16 @@ static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* return next; #else UNUSED(page); - return mi_block_nextx(page,block,0,0); + return mi_block_nextx(page,block,NULL); #endif } static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) { #ifdef MI_ENCODE_FREELIST - mi_block_set_nextx(page,block,next, page->key[0], page->key[1]); + mi_block_set_nextx(page,block,next, page->keys); #else UNUSED(page); - mi_block_set_nextx(page,block, next,0,0); + mi_block_set_nextx(page,block,next,NULL); #endif } diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index ccb37fcf..71f3ae80 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -48,25 +48,24 @@ terms of the MIT license. A copy of the license can be found in the file #endif #endif +// Reserve extra padding at the end of each block to be more resilient against heap block overflows. +// The padding can detect byte-precise buffer overflow on free. +#if !defined(MI_PADDING) && (MI_DEBUG>=1) +#define MI_PADDING 1 +#endif + + // Encoded free lists allow detection of corrupted free lists // and can detect buffer overflows, modify after free, and double `free`s. -#if (MI_SECURE>=3 || MI_DEBUG>=1) +#if (MI_SECURE>=3 || MI_DEBUG>=1 || defined(MI_PADDING)) #define MI_ENCODE_FREELIST 1 #endif -// Reserve extra padding at the end of each block to be more resilient against heap block overflows. -// If free lists are encoded, the padding can detect byte-precise buffer overflow on free. -#if (!defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1)) -#define MI_PADDING -#endif - - // ------------------------------------------------------ // Platform specific values // ------------------------------------------------------ - // ------------------------------------------------------ // Size of a pointer. 
// We assume that `sizeof(void*)==sizeof(intptr_t)` @@ -218,7 +217,7 @@ typedef struct mi_page_s { mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) #ifdef MI_ENCODE_FREELIST - uintptr_t key[2]; // two random keys to encode the free lists (see `_mi_block_next`) + uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) #endif uint32_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) uint32_t xblock_size; // size available in each block (always `>0`) @@ -306,8 +305,8 @@ typedef struct mi_random_cxt_s { // In debug mode there is a padding stucture at the end of the blocks to check for buffer overflows #if defined(MI_PADDING) typedef struct mi_padding_s { - uint32_t block; // (encoded) lower 32 bits of the block address. (to check validity of the block) - uint32_t delta; // (encoded) padding bytes before the block. (mi_usable_size(p) - decode(delta) == exact allocated bytes) + uint32_t canary; // encoded block value to check validity of the padding (in case of overflow) + uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes) } mi_padding_t; #define MI_PADDING_SIZE (sizeof(mi_padding_t)) #define MI_PADDING_WSIZE ((MI_PADDING_SIZE + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE) @@ -327,7 +326,7 @@ struct mi_heap_s { volatile _Atomic(mi_block_t*) thread_delayed_free; uintptr_t thread_id; // thread this heap belongs too uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) - uintptr_t key[2]; // two random keys used to encode the `thread_delayed_free` list + uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation size_t page_count; // total number of pages in the `pages` queues. bool no_reclaim; // `true` if this heap should not reclaim abandoned pages diff --git a/src/alloc.c b/src/alloc.c index 54057661..134f5b85 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -48,10 +48,11 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); - padding->block = (uint32_t)(((uintptr_t)block >> MI_INTPTR_SHIFT) ^ page->key[0]); - padding->delta = (uint32_t)(delta ^ page->key[1]); + padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); + padding->delta = (uint32_t)(delta); uint8_t* fill = (uint8_t*)padding - delta; - for (ptrdiff_t i = 0; i < delta; i++) { fill[i] = MI_DEBUG_PADDING; } + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes + for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } #endif return block; } @@ -175,7 +176,7 @@ static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, con } static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - mi_block_t* n = mi_block_nextx(page, block, page->key[0], page->key[1]); // pretend it is freed, and get the decoded first field + mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? 
(n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? { @@ -198,33 +199,35 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block // --------------------------------------------------------------------------- #if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST) -static mi_padding_t mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* bsize) { +static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { *bsize = mi_page_usable_block_size(page); const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); - mi_padding_t pad; - pad.block = padding->block ^ (uint32_t)page->key[0]; - pad.delta = padding->delta ^ (uint32_t)page->key[1]; - return pad; + *delta = padding->delta; + return ((uint32_t)mi_ptr_encode(page,block,page->keys) == padding->canary && *delta <= *bsize); } // Return the exact usable size of a block. static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { size_t bsize; - mi_padding_t pad = mi_page_decode_padding(page, block, &bsize); - return bsize - pad.delta; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); mi_assert_internal(delta <= bsize); + return (ok ? bsize - delta : 0); } static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { size_t bsize; - const mi_padding_t pad = mi_page_decode_padding(page, block, &bsize); + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); *size = *wrong = bsize; - if ((uint32_t)((uintptr_t)block >> MI_INTPTR_SHIFT) != pad.block) return false; - if (pad.delta > bsize) return false; // can be equal for zero-sized allocation! - *size = bsize - pad.delta; - uint8_t* fill = (uint8_t*)block + bsize - pad.delta; - for (uint32_t i = 0; i < pad.delta; i++) { + if (!ok) return false; + mi_assert_internal(bsize >= delta); + *size = bsize - delta; + uint8_t* fill = (uint8_t*)block + bsize - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes + for (size_t i = 0; i < maxpad; i++) { if (fill[i] != MI_DEBUG_PADDING) { - *wrong = bsize - pad.delta + i; + *wrong = bsize - delta + i; return false; } } @@ -245,13 +248,16 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { // so it will later not trigger an overflow error in `mi_free_block`. 
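// (Editorial note: compared to the previous patch, delta is now stored
// unencoded and only the canary carries key material, computed as
// (uint32_t)mi_ptr_encode(page, block, page->keys). Verification becomes a
// plain equality test, and mi_padding_shrink below can rewrite delta without
// re-deriving anything from the keys.)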
static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { size_t bsize; - mi_padding_t pad = mi_page_decode_padding(page, block, &bsize); - if ((bsize - pad.delta) >= min_size) return; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); + if (!ok || (bsize - delta) >= min_size) return; // usually already enough space mi_assert_internal(bsize >= min_size); - ptrdiff_t delta = (bsize - min_size); - mi_assert_internal(delta >= 0 && delta < (ptrdiff_t)bsize); + if (bsize < min_size) return; // should never happen + size_t new_delta = (bsize - min_size); + mi_assert_internal(new_delta < bsize); mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); - padding->delta = (uint32_t)(delta ^ page->key[1]); + padding->delta = (uint32_t)new_delta; } #else static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { @@ -348,7 +354,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc mi_block_t* dfree; do { dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free); - mi_block_set_nextx(heap,block,dfree, heap->key[0], heap->key[1]); + mi_block_set_nextx(heap,block,dfree, heap->keys); } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree)); } diff --git a/src/heap.c b/src/heap.c index e76a147c..1c287db2 100644 --- a/src/heap.c +++ b/src/heap.c @@ -194,9 +194,9 @@ mi_heap_t* mi_heap_new(void) { heap->tld = bheap->tld; heap->thread_id = _mi_thread_id(); _mi_random_split(&bheap->random, &heap->random); - heap->cookie = _mi_heap_random_next(heap) | 1; - heap->key[0] = _mi_heap_random_next(heap); - heap->key[1] = _mi_heap_random_next(heap); + heap->cookie = _mi_heap_random_next(heap) | 1; + heap->keys[0] = _mi_heap_random_next(heap); + heap->keys[1] = _mi_heap_random_next(heap); heap->no_reclaim = true; // don't reclaim abandoned pages or otherwise destroy is unsafe return heap; } diff --git a/src/init.c b/src/init.c index c657fa4c..fc62880e 100644 --- a/src/init.c +++ b/src/init.c @@ -173,9 +173,9 @@ static bool _mi_heap_init(void) { memcpy(heap, &_mi_heap_empty, sizeof(*heap)); heap->thread_id = _mi_thread_id(); _mi_random_init(&heap->random); - heap->cookie = _mi_heap_random_next(heap) | 1; - heap->key[0] = _mi_heap_random_next(heap); - heap->key[1] = _mi_heap_random_next(heap); + heap->cookie = _mi_heap_random_next(heap) | 1; + heap->keys[0] = _mi_heap_random_next(heap); + heap->keys[1] = _mi_heap_random_next(heap); heap->tld = tld; tld->heap_backing = heap; tld->segments.stats = &tld->stats; @@ -418,9 +418,9 @@ void mi_process_init(void) mi_attr_noexcept { _mi_verbose_message("process init: 0x%zx\n", _mi_heap_main.thread_id); _mi_random_init(&_mi_heap_main.random); #ifndef __APPLE__ // TODO: fix this? cannot update cookie if allocation already happened.. 
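// ---------------------------------------------------------------------------
// A sketch of what the two random `keys` initialized in this patch are used
// for (the encode/decode pair below is illustrative and not necessarily
// mimalloc's exact formula): free-list next pointers are never stored raw,
// so an overflow that overwrites one decodes to a near-random value that is
// very likely to fail the alignment and same-page checks in
// mi_check_is_double_free.
#include <stdint.h>

static inline uintptr_t rotl(uintptr_t x, unsigned s) {
  const unsigned bits = 8*sizeof(uintptr_t);
  s %= bits;
  return (s == 0 ? x : ((x << s) | (x >> (bits - s))));
}
static inline uintptr_t rotr(uintptr_t x, unsigned s) {
  const unsigned bits = 8*sizeof(uintptr_t);
  s %= bits;
  return (s == 0 ? x : ((x >> s) | (x << (bits - s))));
}
static inline uintptr_t ptr_encode(uintptr_t p, const uintptr_t keys[2]) {
  return rotl(p ^ keys[1], (unsigned)keys[0]) + keys[0];
}
static inline uintptr_t ptr_decode(uintptr_t x, const uintptr_t keys[2]) {
  return rotr(x - keys[0], (unsigned)keys[0]) ^ keys[1];  // exact inverse of ptr_encode
}
// ---------------------------------------------------------------------------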
- _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main); + _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); + _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); + _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); #endif mi_process_setup_auto_thread_done(); _mi_os_init(); diff --git a/src/page.c b/src/page.c index 57adbc91..23a04a84 100644 --- a/src/page.c +++ b/src/page.c @@ -281,7 +281,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) { // and free them all while(block != NULL) { - mi_block_t* next = mi_block_nextx(heap,block, heap->key[0], heap->key[1]); + mi_block_t* next = mi_block_nextx(heap,block, heap->keys); // use internal free instead of regular one to keep stats etc correct if (!_mi_free_delayed_block(block)) { // we might already start delayed freeing while another thread has not yet @@ -289,7 +289,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) { mi_block_t* dfree; do { dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free); - mi_block_set_nextx(heap, block, dfree, heap->key[0], heap->key[1]); + mi_block_set_nextx(heap, block, dfree, heap->keys); } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree)); } block = next; @@ -348,7 +348,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { #if MI_DEBUG>1 // check there are no references left.. - for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->key[0], pheap->key[1])) { + for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) { mi_assert_internal(_mi_ptr_page(block) != page); } #endif @@ -609,8 +609,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); #ifdef MI_ENCODE_FREELIST - page->key[0] = _mi_heap_random_next(heap); - page->key[1] = _mi_heap_random_next(heap); + page->keys[0] = _mi_heap_random_next(heap); + page->keys[1] = _mi_heap_random_next(heap); #endif page->is_zero = page->is_zero_init; @@ -623,8 +623,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->retire_expire == 0); mi_assert_internal(!mi_page_has_aligned(page)); #if (MI_ENCODE_FREELIST) - mi_assert_internal(page->key[0] != 0); - mi_assert_internal(page->key[1] != 0); + mi_assert_internal(page->keys[0] != 0); + mi_assert_internal(page->keys[1] != 0); #endif mi_assert_expensive(mi_page_is_valid_init(page)); From 60cfc623be8838ca32aad627c13f54aa53c18c5f Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 1 Feb 2020 14:29:12 -0800 Subject: [PATCH 14/62] fix zero initialization of blocks under 8 bytes when padding check is active --- src/alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alloc.c b/src/alloc.c index 1f053db9..61f34353 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -110,8 +110,8 @@ void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) { mi_assert_internal(p != NULL); mi_assert_internal(mi_usable_size(p) >= size); // size can be zero mi_assert_internal(_mi_ptr_page(p)==page); - if (page->is_zero) { - // already zero initialized memory? 
+ if (page->is_zero && size > sizeof(mi_block_t)) { + // already zero initialized memory ((mi_block_t*)p)->next = 0; // clear the free list pointer mi_assert_expensive(mi_mem_is_zero(p, mi_usable_size(p))); } From 5135c2b96a5acd08d2639cf70031f07b08c010f6 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 1 Feb 2020 14:29:55 -0800 Subject: [PATCH 15/62] add test-api to vs solution --- ide/vs2019/mimalloc-test-api.vcxproj | 155 +++++++++++++++++++++++++++ ide/vs2019/mimalloc.sln | 14 ++- 2 files changed, 167 insertions(+), 2 deletions(-) create mode 100644 ide/vs2019/mimalloc-test-api.vcxproj diff --git a/ide/vs2019/mimalloc-test-api.vcxproj b/ide/vs2019/mimalloc-test-api.vcxproj new file mode 100644 index 00000000..812a9cb1 --- /dev/null +++ b/ide/vs2019/mimalloc-test-api.vcxproj @@ -0,0 +1,155 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {FFF7958F-750E-4C21-A04D-22707CC66878} + mimalloc-test-api + 10.0 + mimalloc-test-api + + + + Application + true + v142 + + + Application + false + v142 + true + + + Application + true + v142 + + + Application + false + v142 + true + + + + + + + + + + + + + + + + + + + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + + Level3 + Disabled + true + true + ..\..\include + + + Console + + + + + Level3 + Disabled + true + true + ..\..\include + + + Console + + + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include + %(PreprocessorDefinitions);NDEBUG + + + true + true + Console + + + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include + %(PreprocessorDefinitions);NDEBUG + + + true + true + Console + + + + + + + + + {abb5eae7-b3e6-432e-b636-333449892ea6} + + + + + + diff --git a/ide/vs2019/mimalloc.sln b/ide/vs2019/mimalloc.sln index aeab6b88..fcb938a4 100644 --- a/ide/vs2019/mimalloc.sln +++ b/ide/vs2019/mimalloc.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.28010.2016 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29709.97 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc", "mimalloc.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA6}" EndProject @@ -13,6 +13,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override-test", "m EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-stress", "mimalloc-test-stress.vcxproj", "{FEF7958F-750E-4C21-A04D-22707CC66878}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-api", "mimalloc-test-api.vcxproj", "{FFF7958F-750E-4C21-A04D-22707CC66878}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -61,6 +63,14 @@ Global {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 + 
{FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64
+		{FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64
+		{FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32
+		{FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32
+		{FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64
+		{FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64
+		{FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32
+		{FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
From fea903900d7f40c1c9af4f9059dc2fbfaa6a187c Mon Sep 17 00:00:00 2001
From: daan
Date: Sat, 1 Feb 2020 14:33:24 -0800
Subject: [PATCH 16/62] use __thread locals on linux

---
 include/mimalloc-internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index 872c5269..7173a189 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc-types.h"
 
 #if defined(MI_MALLOC_OVERRIDE)
-#if defined(__APPLE__) || defined(__linux__)
+#if defined(__APPLE__)
 #include <pthread.h>
 #define MI_TLS_PTHREADS
 #elif (defined(__OpenBSD__) || defined(__DragonFly__))
 #define MI_TLS_RECURSE_GUARD
From 0989562c2d87aa77f33e590357501fc9d2d485bc Mon Sep 17 00:00:00 2001
From: daan
Date: Sat, 1 Feb 2020 16:57:00 -0800
Subject: [PATCH 17/62] add initial fast tls for macOSX

---
 include/mimalloc-internal.h | 32 ++++++++++++++++++++++++++------
 src/init.c                  | 19 +++++++++++++------
 test/test-stress.c          |  4 ++--
 3 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index 7173a189..0e3ebed8 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -11,7 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc-types.h"
 
 #if defined(MI_MALLOC_OVERRIDE)
-#if defined(__APPLE__)
+#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__))
+#define MI_TLS_OSX_FAST
+#define MI_TLS_OSX_SLOT 94  // seems unused, except in Webkit? See:
+#elif defined(__APPLE__)
+#include <pthread.h>
+#define MI_TLS_PTHREADS
 #elif (defined(__OpenBSD__) || defined(__DragonFly__))
 #define MI_TLS_RECURSE_GUARD
 #endif
 #endif
@@ -284,14 +287,31 @@ extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value o
 extern mi_heap_t _mi_heap_main;         // statically allocated main backing heap
 extern bool _mi_process_is_initialized;
 
-#if defined(MI_TLS_PTHREADS)
+#if defined(MI_TLS_OSX_FAST)
+#define MI_TLS_OSX_OFFSET (MI_TLS_OSX_SLOT*sizeof(void*))
+static inline void* mi_tls_osx_fast_get(void) {
+  void* ret;
+  __asm__("mov %%gs:%1, %0" : "=r" (ret) : "m" (*(void**)(MI_TLS_OSX_OFFSET)));
+  return ret;
+}
+static inline void mi_tls_osx_fast_set(void* value) {
+  __asm__("movq %1,%%gs:%0" : "=m" (*(void**)(MI_TLS_OSX_OFFSET)) : "rn" (value));
+}
+#elif defined(MI_TLS_PTHREADS)
 extern pthread_key_t _mi_heap_default_key;
 #else
 extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
 #endif
 
 static inline mi_heap_t* mi_get_default_heap(void) {
-#if defined(MI_TLS_PTHREADS)
+#if defined(MI_TLS_OSX_FAST)
+  // Use a fixed slot in the TSD on MacOSX to avoid recursion (since the loader calls malloc).
+  // We use slot 94 (__PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4)
+  // which seems unused except for the more recent Webkit
+  // Use with care.
+  mi_heap_t* heap = (mi_heap_t*)mi_tls_osx_fast_get();
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREADS)
   // Use pthreads for TLS; this is used on macOSX with interpose as the loader calls `malloc`
   // to allocate TLS storage leading to recursive calls if __thread declared variables are accessed.
   // Using pthreads allows us to initialize without recursive calls. (performance seems still quite good).
   mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? (mi_heap_t*)&_mi_heap_empty : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
   return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
 #else
 #if defined(MI_TLS_RECURSE_GUARD)
   // On some BSD platforms, like openBSD, the dynamic loader calls `malloc`
-  // to initialize thread local data. To avoid recursion, we need to avoid
-  // accessing the thread local `_mi_default_heap` until our module is loaded
-  // and use the statically allocated main heap until that time.
+  // to initialize thread local data (before our module is loaded).
+  // To avoid recursion, we need to avoid accessing the thread local `_mi_default_heap`
+  // until our module is loaded and use the statically allocated main heap until that time.
   // TODO: patch ourselves dynamically to avoid this check every time?
   if (mi_unlikely(!_mi_process_is_initialized)) return &_mi_heap_main;
 #endif
   return _mi_heap_default;
 #endif
diff --git a/src/init.c b/src/init.c
index 431b7fee..960cccf1 100644
--- a/src/init.c
+++ b/src/init.c
@@ -260,14 +260,15 @@ static void _mi_thread_done(mi_heap_t* default_heap);
   // use thread local storage keys to detect thread ending
   #include <windows.h>
   #include <fibersapi.h>
-  static DWORD mi_fls_key;
+  static DWORD mi_fls_key = (DWORD)(-1);
   static void NTAPI mi_fls_done(PVOID value) {
     if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
   }
 #elif defined(MI_USE_PTHREADS)
-  // use pthread locol storage keys to detect thread ending
+  // use pthread local storage keys to detect thread ending
+  // (and used with MI_TLS_PTHREADS for the default heap)
   #include <pthread.h>
-  pthread_key_t _mi_heap_default_key;
+  pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
   static void mi_pthread_done(void* value) {
     if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
   }
@@ -287,6 +288,7 @@ static void mi_process_setup_auto_thread_done(void) {
   #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
     mi_fls_key = FlsAlloc(&mi_fls_done);
   #elif defined(MI_USE_PTHREADS)
+    mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
     pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
   #endif
   _mi_heap_set_default_direct(&_mi_heap_main);
@@ -331,9 +333,14 @@ static void _mi_thread_done(mi_heap_t* heap) {

 void _mi_heap_set_default_direct(mi_heap_t* heap)  {
   mi_assert_internal(heap != NULL);
-  #if !defined(MI_TLS_PTHREADS)
+  #if defined(MI_TLS_OSX_FAST)
+  mi_tls_osx_fast_set(heap);
+  #elif defined(MI_TLS_PTHREADS)
+  // we use _mi_heap_default_key
+  #else
   _mi_heap_default = heap;
-  #endif
+  #endif
+
   // ensure the default heap is passed to `_mi_thread_done`
   // setting to a non-NULL value also ensures `mi_thread_done` is called.
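// ---------------------------------------------------------------------------
// A self-contained sketch of the thread-exit detection idiom used here
// (assuming POSIX threads; all names below are illustrative): a TLS key
// destructor only fires for threads that stored a non-NULL value, which is
// why the heap is always registered via pthread_setspecific even when the
// default-heap pointer itself lives in the fast TLS slot.
#include <pthread.h>
#include <stdio.h>

static pthread_key_t done_key;

static void on_thread_exit(void* value) {   // analogue of mi_pthread_done
  printf("thread done, value=%p\n", value); // would call _mi_thread_done
}
static void* worker(void* arg) {
  pthread_setspecific(done_key, arg);       // non-NULL => destructor runs at exit
  return NULL;
}
int main(void) {
  pthread_key_create(&done_key, &on_thread_exit);
  pthread_t t;
  int dummy = 0;
  pthread_create(&t, NULL, &worker, &dummy);
  pthread_join(t, NULL);
  return 0;
}
// ---------------------------------------------------------------------------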
#if defined(_WIN32) && defined(MI_SHARED_LIB) @@ -342,7 +349,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(mi_fls_key != 0); FlsSetValue(mi_fls_key, heap); #elif defined(MI_USE_PTHREADS) - // mi_assert_internal(_mi_heap_default_key != 0); // often 0 is also the allocated key + mi_assert_internal(_mi_heap_default_key != (pthread_key_t)(-1)); pthread_setspecific(_mi_heap_default_key, heap); #endif } diff --git a/test/test-stress.c b/test/test-stress.c index 1bfc5012..7d8993a0 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -27,7 +27,7 @@ terms of the MIT license. // argument defaults static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 10; // scaling factor -static int ITER = 5; // N full iterations destructing and re-creating all threads +static int ITER = 50; // N full iterations destructing and re-creating all threads // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int SCALE = 100; // scaling factor @@ -250,7 +250,7 @@ int main(int argc, char** argv) { #endif // mi_collect(true); - // mi_stats_print(NULL); + mi_stats_print(NULL); //bench_end_program(); return 0; } From 3f17ac287c575e73e30619f970686b7b63951820 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 1 Feb 2020 17:29:30 -0800 Subject: [PATCH 18/62] add 'nodiscard' attribute to mimalloc interface --- include/mimalloc.h | 154 +++++++++++++++++++----------------- src/options.c | 2 +- test/main-override-static.c | 2 +- 3 files changed, 84 insertions(+), 74 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 94fcd788..f94d9ee7 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -24,6 +24,16 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_attr_noexcept #endif +#if (__cplusplus >= 201703) +#define mi_decl_nodiscard [[nodiscard]] +#elif (__GNUC__ >= 4) // includes clang and icc +#define mi_decl_nodiscard __attribute__((warn_unused_result)) +#elif (_MSC_VER >= 1700) +#define mi_decl_nodiscard _Check_return_ +#else +#define mi_decl_nodiscard +#endif + #ifdef _MSC_VER #if !defined(MI_SHARED_LIB) #define mi_decl_export @@ -85,15 +95,15 @@ extern "C" { // Standard malloc interface // ------------------------------------------------------ -mi_decl_export mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_export mi_decl_allocator void* mi_calloc(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); -mi_decl_export mi_decl_allocator void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export mi_decl_allocator void* mi_expand(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_calloc(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_expand(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export void mi_free(void* p) mi_attr_noexcept; -mi_decl_export char* mi_strdup(const char* s) mi_attr_noexcept; -mi_decl_export char* mi_strndup(const char* s, size_t 
n) mi_attr_noexcept; -mi_decl_export char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept; +mi_decl_export void mi_free(void* p) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export char* mi_strdup(const char* s) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export char* mi_strndup(const char* s, size_t n) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept; // ------------------------------------------------------ // Extended functionality @@ -101,16 +111,16 @@ mi_decl_export char* mi_realpath(const char* fname, char* resolved_name) mi_attr #define MI_SMALL_WSIZE_MAX (128) #define MI_SMALL_SIZE_MAX (MI_SMALL_WSIZE_MAX*sizeof(void*)) -mi_decl_export mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_export mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_export mi_decl_allocator void* mi_zalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_zalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_export mi_decl_allocator void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); -mi_decl_export mi_decl_allocator void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); -mi_decl_export mi_decl_allocator void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept; -mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept; // ------------------------------------------------------ @@ -145,14 +155,14 @@ mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_ // allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`. 
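// ---------------------------------------------------------------------------
// Usage sketch for the `_at` variants declared below (a hedged example; the
// header-before-payload scenario is hypothetical): the documented intent is
// that mi_malloc_aligned_at(size, alignment, offset) aligns `p + offset`
// rather than `p` itself, which helps when a small header precedes an
// aligned payload.
#include <mimalloc.h>
#include <assert.h>
#include <stdint.h>

typedef struct { uint64_t tag; } hdr_t;  // hypothetical 8-byte header

int main(void) {
  void* p = mi_malloc_aligned_at(sizeof(hdr_t) + 4096, 64, sizeof(hdr_t));
  assert(((uintptr_t)p + sizeof(hdr_t)) % 64 == 0);  // payload is 64-byte aligned
  mi_free(p);
  return 0;
}
// ---------------------------------------------------------------------------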
// ------------------------------------------------------------------------------------- -mi_decl_export mi_decl_allocator void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); -mi_decl_export mi_decl_allocator void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_export mi_decl_allocator void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); -mi_decl_export mi_decl_allocator void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_export mi_decl_allocator void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3); -mi_decl_export mi_decl_allocator void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); -mi_decl_export mi_decl_allocator void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); -mi_decl_export mi_decl_allocator void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); // ------------------------------------------------------------------------------------- @@ -161,7 +171,7 @@ mi_decl_export mi_decl_allocator void* mi_realloc_aligned_at(void* p, size_t new struct mi_heap_s; typedef struct mi_heap_s mi_heap_t; -mi_decl_export mi_heap_t* mi_heap_new(void); +mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new(void); mi_decl_export void mi_heap_delete(mi_heap_t* heap); mi_decl_export void mi_heap_destroy(mi_heap_t* heap); mi_decl_export mi_heap_t* mi_heap_set_default(mi_heap_t* heap); @@ -169,28 +179,28 @@ mi_decl_export 
mi_heap_t* mi_heap_get_default(void); mi_decl_export mi_heap_t* mi_heap_get_backing(void); mi_decl_export void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept; -mi_decl_export mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export mi_decl_allocator void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export mi_decl_allocator void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); -mi_decl_export mi_decl_allocator void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); -mi_decl_export mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export mi_decl_allocator void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); -mi_decl_export mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept; -mi_decl_export mi_decl_allocator void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); -mi_decl_export char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept; -mi_decl_export char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept; -mi_decl_export char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept; -mi_decl_export mi_decl_allocator void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); -mi_decl_export mi_decl_allocator void* 
mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export mi_decl_allocator void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); -mi_decl_export mi_decl_allocator void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export mi_decl_allocator void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3) mi_attr_alloc_align(4); -mi_decl_export mi_decl_allocator void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); -mi_decl_export mi_decl_allocator void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3) mi_attr_alloc_align(4); -mi_decl_export mi_decl_allocator void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3) mi_attr_alloc_align(4); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3) mi_attr_alloc_align(4); +mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); // -------------------------------------------------------------------------------- @@ -240,8 +250,8 @@ typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg); // Experimental -mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; -mi_decl_export bool mi_is_redirected() mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const 
void* p) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export bool mi_is_redirected() mi_attr_noexcept; mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept; mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; @@ -297,13 +307,13 @@ typedef enum mi_option_e { } mi_option_t; -mi_decl_export bool mi_option_is_enabled(mi_option_t option); +mi_decl_nodiscard mi_decl_export bool mi_option_is_enabled(mi_option_t option); mi_decl_export void mi_option_enable(mi_option_t option); mi_decl_export void mi_option_disable(mi_option_t option); mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable); mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable); -mi_decl_export long mi_option_get(mi_option_t option); +mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option); mi_decl_export void mi_option_set(mi_option_t option, long value); mi_decl_export void mi_option_set_default(mi_option_t option, long value); @@ -313,24 +323,24 @@ mi_decl_export void mi_option_set_default(mi_option_t option, long value); // (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.) // ------------------------------------------------------------------------------------------------------- -mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept; -mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept; mi_decl_export void mi_cfree(void* p) mi_attr_noexcept; mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept; mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept; -mi_decl_export void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); -mi_decl_export void* mi_valloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); +mi_decl_nodiscard mi_decl_export void* mi_valloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_export void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_export void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); -mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); +mi_decl_nodiscard mi_decl_export void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); +mi_decl_nodiscard mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); -mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept; -mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) 
mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept;
 
-mi_decl_export unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept;
-mi_decl_export unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept;
 mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept;
 mi_decl_export int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept;
@@ -339,14 +349,14 @@ mi_decl_export void mi_free_size_aligned(void* p, size_t size, size_t alignment)
 mi_decl_export void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept;
 
 // The `mi_new` wrappers implement C++ semantics on out-of-memory instead of directly returning `NULL`.
-// (and call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception).
-mi_decl_export void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
-mi_decl_export void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
-mi_decl_export void* mi_new_nothrow(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
-mi_decl_export void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
-mi_decl_export void* mi_new_n(size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(1, 2);
-mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_malloc mi_attr_alloc_size(2);
-mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3);
+// (and call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception).mi_decl_nodiscard ami_decl_export void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export void* mi_new_nothrow(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export void* mi_new_n(size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(1, 2);
+mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3);
 
 #ifdef __cplusplus
 }
@@ -358,7 +368,7 @@ mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_a
 // ---------------------------------------------------------------------------------------------
 
 #ifdef __cplusplus
-#include <limits>       // std::numeric_limits
+#include <cstdint>      // PTRDIFF_MAX
 #if (__cplusplus >= 201103L) || (_MSC_VER > 1900)  // C++11
 #include <type_traits>  // std::true_type
 #include <utility>      // std::forward
@@ -381,10 +391,10 @@ template<class T> struct mi_stl_allocator {
   void
deallocate(T* p, size_type) { mi_free(p); }
 
 #if (__cplusplus >= 201703L)  // C++17
-  T* allocate(size_type count) { return static_cast<T*>(mi_new_n(count, sizeof(T))); }
-  T* allocate(size_type count, const void*) { return allocate(count); }
+  mi_decl_nodiscard T* allocate(size_type count) { return static_cast<T*>(mi_new_n(count, sizeof(T))); }
+  mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); }
 #else
-  pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_new_n(count, sizeof(value_type))); }
+  mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_new_n(count, sizeof(value_type))); }
 #endif
 
 #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
@@ -399,7 +409,7 @@ template<class T> struct mi_stl_allocator {
   void destroy(pointer p) { p->~value_type(); }
 #endif
 
-  size_type max_size() const mi_attr_noexcept { return (std::numeric_limits<size_type>::max() / sizeof(value_type)); }
+  size_type max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); }
   pointer address(reference x) const { return &x; }
   const_pointer address(const_reference x) const { return &x; }
 };
diff --git a/src/options.c b/src/options.c
index 7559a4b5..72a753e1 100644
--- a/src/options.c
+++ b/src/options.c
@@ -85,7 +85,7 @@ void _mi_options_init(void) {
   mi_add_stderr_output(); // now it safe to use stderr for output
   for(int i = 0; i < _mi_option_last; i++ ) {
     mi_option_t option = (mi_option_t)i;
-    mi_option_get(option); // initialize
+    long l = mi_option_get(option); UNUSED(l); // initialize
     if (option != mi_option_verbose) {
       mi_option_desc_t* desc = &options[option];
       _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
diff --git a/test/main-override-static.c b/test/main-override-static.c
index 839a5d2f..950392d0 100644
--- a/test/main-override-static.c
+++ b/test/main-override-static.c
@@ -24,7 +24,7 @@ int main() {
   void* p1 = malloc(78);
   void* p2 = malloc(24);
   free(p1);
-  p1 = malloc(8);
+  p1 = mi_malloc(8);
   //char* s = strdup("hello\n");
   free(p2);
   p2 = malloc(16);
From 8aba40a9728fa50f2d541c8712257ff7cc264b18 Mon Sep 17 00:00:00 2001
From: daan
Date: Sat, 1 Feb 2020 17:41:04 -0800
Subject: [PATCH 19/62] use default declaration for the STL allocator class

---
 ide/vs2019/mimalloc.vcxproj |  2 ++
 include/mimalloc.h          | 22 +++++++++++-----------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/ide/vs2019/mimalloc.vcxproj b/ide/vs2019/mimalloc.vcxproj
index fad6de5d..e18db0c5 100644
--- a/ide/vs2019/mimalloc.vcxproj
+++ b/ide/vs2019/mimalloc.vcxproj
@@ -151,6 +151,7 @@
       Default
       CompileAsCpp
       true
+      Default
       true
@@ -178,6 +179,7 @@
       Default
       CompileAsCpp
       true
+      Default
       true
diff --git a/include/mimalloc.h b/include/mimalloc.h
index f94d9ee7..caf71726 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -25,11 +25,11 @@ terms of the MIT license.
A copy of the license can be found in the file
 #endif
 
 #if (__cplusplus >= 201703)
-#define mi_decl_nodiscard [[nodiscard]]
-#elif (__GNUC__ >= 4)
-#define mi_decl_nodiscard __attribute__((warn_unused_result))
+#define mi_decl_nodiscard    [[nodiscard]]
+#elif (__GNUC__ >= 4)
+#define mi_decl_nodiscard    __attribute__((warn_unused_result))
 #elif (_MSC_VER >= 1700)
-#define mi_decl_nodiscard _Check_return_
+#define mi_decl_nodiscard    _Check_return_
 #else
 #define mi_decl_nodiscard
 #endif
@@ -325,10 +325,10 @@ mi_decl_export void mi_option_set_default(mi_option_t option, long value);
 
 mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept;
 mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept;
-mi_decl_export void mi_cfree(void* p) mi_attr_noexcept;
-mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept;
+mi_decl_export void  mi_cfree(void* p) mi_attr_noexcept;
+mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept;
 
-mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept;
+mi_decl_export int   mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept;
 mi_decl_nodiscard mi_decl_export void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1);
 mi_decl_nodiscard mi_decl_export void* mi_valloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
@@ -341,8 +341,8 @@ mi_decl_nodiscard mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_
 mi_decl_nodiscard mi_decl_export unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept;
 mi_decl_nodiscard mi_decl_export unsigned char*  mi_mbsdup(const unsigned char* s) mi_attr_noexcept;
-mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept;
-mi_decl_export int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept;
+mi_decl_export int  mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept;
+mi_decl_export int  mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept;
 
 mi_decl_export void mi_free_size(void* p, size_t size) mi_attr_noexcept;
 mi_decl_export void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept;
@@ -384,8 +384,8 @@ template<class T> struct mi_stl_allocator {
   typedef value_type const* const_pointer;
   template<class U> struct rebind { typedef mi_stl_allocator<U> other; };
 
-  mi_stl_allocator() mi_attr_noexcept { }
-  mi_stl_allocator(const mi_stl_allocator&) mi_attr_noexcept { }
+  mi_stl_allocator() mi_attr_noexcept = default;
+  mi_stl_allocator(const mi_stl_allocator&) mi_attr_noexcept = default;
   template<class U> mi_stl_allocator(const mi_stl_allocator<U>&) mi_attr_noexcept { }
   mi_stl_allocator select_on_container_copy_construction() const { return *this; }
   void deallocate(T* p, size_type) { mi_free(p); }
From bf2eb55ed12ace317fba24c74786c7e8da1253c6 Mon Sep 17 00:00:00 2001
From: daan
Date: Sat, 1 Feb 2020 17:48:26 -0800
Subject: [PATCH 20/62] reformatting

---
 include/mimalloc.h | 78 ++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/include/mimalloc.h b/include/mimalloc.h
index caf71726..346774b7 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -25,50 +25,50 @@ terms of the MIT license.
A copy of the license can be found in the file #endif #if (__cplusplus >= 201703) -#define mi_decl_nodiscard [[nodiscard]] + #define mi_decl_nodiscard [[nodiscard]] #elif (__GNUC__ >= 4) -#define mi_decl_nodiscard __attribute__((warn_unused_result)) + #define mi_decl_nodiscard __attribute__((warn_unused_result)) #elif (_MSC_VER >= 1700) -#define mi_decl_nodiscard _Check_return_ + #define mi_decl_nodiscard _Check_return_ #else -#define mi_decl_nodiscard + #define mi_decl_nodiscard #endif #ifdef _MSC_VER #if !defined(MI_SHARED_LIB) #define mi_decl_export #elif defined(MI_SHARED_LIB_EXPORT) - #define mi_decl_export __declspec(dllexport) + #define mi_decl_export __declspec(dllexport) #else - #define mi_decl_export __declspec(dllimport) + #define mi_decl_export __declspec(dllimport) #endif #if (_MSC_VER >= 1900) && !defined(__EDG__) - #define mi_decl_allocator __declspec(allocator) __declspec(restrict) + #define mi_decl_allocator __declspec(allocator) __declspec(restrict) #else - #define mi_decl_allocator __declspec(restrict) + #define mi_decl_allocator __declspec(restrict) #endif - #define mi_cdecl __cdecl + #define mi_cdecl __cdecl #define mi_attr_malloc #define mi_attr_alloc_size(s) #define mi_attr_alloc_size2(s1,s2) #define mi_attr_alloc_align(p) -#elif defined(__GNUC__) // includes clang and icc - #define mi_cdecl // leads to warnings... __attribute__((cdecl)) - #define mi_decl_export __attribute__((visibility("default"))) +#elif defined(__GNUC__) // includes clang and icc + #define mi_cdecl // leads to warnings... __attribute__((cdecl)) + #define mi_decl_export __attribute__((visibility("default"))) #define mi_decl_allocator - #define mi_attr_malloc __attribute__((malloc)) + #define mi_attr_malloc __attribute__((malloc)) #if (defined(__clang_major__) && (__clang_major__ < 4)) || (__GNUC__ < 5) - #define mi_attr_alloc_size(s) - #define mi_attr_alloc_size2(s1,s2) - #define mi_attr_alloc_align(p) + #define mi_attr_alloc_size(s) + #define mi_attr_alloc_size2(s1,s2) + #define mi_attr_alloc_align(p) #elif defined(__INTEL_COMPILER) - #define mi_attr_alloc_size(s) __attribute__((alloc_size(s))) - #define mi_attr_alloc_size2(s1,s2) __attribute__((alloc_size(s1,s2))) - #define mi_attr_alloc_align(p) + #define mi_attr_alloc_size(s) __attribute__((alloc_size(s))) + #define mi_attr_alloc_size2(s1,s2) __attribute__((alloc_size(s1,s2))) + #define mi_attr_alloc_align(p) #else - #define mi_attr_alloc_size(s) __attribute__((alloc_size(s))) - #define mi_attr_alloc_size2(s1,s2) __attribute__((alloc_size(s1,s2))) - #define mi_attr_alloc_align(p) __attribute__((alloc_align(p))) + #define mi_attr_alloc_size(s) __attribute__((alloc_size(s))) + #define mi_attr_alloc_size2(s1,s2) __attribute__((alloc_size(s1,s2))) + #define mi_attr_alloc_align(p) __attribute__((alloc_align(p))) #endif #else #define mi_cdecl @@ -100,7 +100,7 @@ mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_calloc(size_t count, mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); mi_decl_export mi_decl_allocator void* mi_expand(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export void mi_free(void* p) mi_attr_noexcept; +mi_decl_export void mi_free(void* p) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export char* mi_strdup(const char* s) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export char* mi_strndup(const char* s, size_t n) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export char* 
mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept; @@ -168,6 +168,7 @@ mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_realloc_aligned_at(v // ------------------------------------------------------------------------------------- // Heaps: first-class, but can only allocate from the same thread that created it. // ------------------------------------------------------------------------------------- + struct mi_heap_s; typedef struct mi_heap_s mi_heap_t; @@ -259,16 +260,17 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size // deprecated mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; + // ------------------------------------------------------ // Convenience // ------------------------------------------------------ -#define mi_malloc_tp(tp) ((tp*)mi_malloc(sizeof(tp))) -#define mi_zalloc_tp(tp) ((tp*)mi_zalloc(sizeof(tp))) -#define mi_calloc_tp(tp,n) ((tp*)mi_calloc(n,sizeof(tp))) -#define mi_mallocn_tp(tp,n) ((tp*)mi_mallocn(n,sizeof(tp))) -#define mi_reallocn_tp(p,tp,n) ((tp*)mi_reallocn(p,n,sizeof(tp))) -#define mi_recalloc_tp(p,tp,n) ((tp*)mi_recalloc(p,n,sizeof(tp))) +#define mi_malloc_tp(tp) ((tp*)mi_malloc(sizeof(tp))) +#define mi_zalloc_tp(tp) ((tp*)mi_zalloc(sizeof(tp))) +#define mi_calloc_tp(tp,n) ((tp*)mi_calloc(n,sizeof(tp))) +#define mi_mallocn_tp(tp,n) ((tp*)mi_mallocn(n,sizeof(tp))) +#define mi_reallocn_tp(p,tp,n) ((tp*)mi_reallocn(p,n,sizeof(tp))) +#define mi_recalloc_tp(p,tp,n) ((tp*)mi_recalloc(p,n,sizeof(tp))) #define mi_heap_malloc_tp(hp,tp) ((tp*)mi_heap_malloc(hp,sizeof(tp))) #define mi_heap_zalloc_tp(hp,tp) ((tp*)mi_heap_zalloc(hp,sizeof(tp))) @@ -307,15 +309,15 @@ typedef enum mi_option_e { } mi_option_t; -mi_decl_nodiscard mi_decl_export bool mi_option_is_enabled(mi_option_t option); -mi_decl_export void mi_option_enable(mi_option_t option); -mi_decl_export void mi_option_disable(mi_option_t option); -mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable); -mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable); +mi_decl_nodiscard mi_decl_export bool mi_option_is_enabled(mi_option_t option); +mi_decl_export void mi_option_enable(mi_option_t option); +mi_decl_export void mi_option_disable(mi_option_t option); +mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable); +mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable); -mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option); -mi_decl_export void mi_option_set(mi_option_t option, long value); -mi_decl_export void mi_option_set_default(mi_option_t option, long value); +mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option); +mi_decl_export void mi_option_set(mi_option_t option, long value); +mi_decl_export void mi_option_set_default(mi_option_t option, long value); // ------------------------------------------------------------------------------------------------------- @@ -349,7 +351,7 @@ mi_decl_export void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_decl_export void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept; // The `mi_new` wrappers implement C++ semantics on out-of-memory instead of directly returning `NULL`. 
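// ---------------------------------------------------------------------------
// What the `mi_decl_nodiscard` annotations in this series buy (a sketch
// assuming GCC or Clang, where the macro expands to
// __attribute__((warn_unused_result)) as defined earlier in this patch set):
// silently dropping an allocation result becomes a compile-time warning.
#include <mimalloc.h>

void nodiscard_demo(void) {
  mi_malloc(32);            // warning: ignoring return value of 'mi_malloc'
  void* p = mi_malloc(32);  // fine: the result is used
  mi_free(p);
}
// ---------------------------------------------------------------------------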
-// (and call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception).mi_decl_nodiscard ami_decl_export void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
+// (and call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception).
 mi_decl_nodiscard mi_decl_export void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
 mi_decl_nodiscard mi_decl_export void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
 mi_decl_nodiscard mi_decl_export void* mi_new_nothrow(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
 mi_decl_nodiscard mi_decl_export void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
 mi_decl_nodiscard mi_decl_export void* mi_new_n(size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(1, 2);
 mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_malloc mi_attr_alloc_size(2);
 mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3);
From 4a5f3592c064e00ea1378732ab91b6bd2ebcaf04 Mon Sep 17 00:00:00 2001
From: daan
Date: Sun, 2 Feb 2020 09:25:39 -0800
Subject: [PATCH 21/62] fix build warning on FreeBSD

---
 src/os.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/os.c b/src/os.c
index 6e8c12d8..8427a1b5 100644
--- a/src/os.c
+++ b/src/os.c
@@ -285,6 +285,7 @@ static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int pr
   }
   #else
   UNUSED(try_alignment);
+  UNUSED(mi_os_get_aligned_hint);
   #endif
   if (p==NULL) {
     p = mmap(addr,size,protect_flags,flags,fd,0);
From d2db9f1fc26e9545bcacfb35376ccda473adf803 Mon Sep 17 00:00:00 2001
From: daan
Date: Sun, 2 Feb 2020 13:12:22 -0800
Subject: [PATCH 22/62] update thread local storage

---
 include/mimalloc-internal.h | 133 +++++++++++++++++++++---------------
 src/init.c                  |  20 ++++--
 2 files changed, 95 insertions(+), 58 deletions(-)

diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index 0e3ebed8..0669048e 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -10,18 +10,6 @@ terms of the MIT license. A copy of the license can be found in the file
 
 #include "mimalloc-types.h"
 
-#if defined(MI_MALLOC_OVERRIDE)
-#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__))
-#define MI_TLS_OSX_FAST
-#define MI_TLS_OSX_SLOT 94  // seems unused, except in Webkit? See:
-#elif defined(__APPLE__)
-#include <pthread.h>
-#define MI_TLS_PTHREADS
-#elif (defined(__OpenBSD__) || defined(__DragonFly__))
-#define MI_TLS_RECURSE_GUARD
-#endif
-#endif
-
 #if (MI_DEBUG>0)
 #define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
 #else
@@ -284,47 +272,53 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot
 ----------------------------------------------------------- */
 
 extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value of the thread local default heap
-extern mi_heap_t _mi_heap_main;         // statically allocated main backing heap
 extern bool _mi_process_is_initialized;
+mi_heap_t*  _mi_heap_main_get(void);    // statically allocated main backing heap
 
+#if defined(MI_MALLOC_OVERRIDE)
+// On some systems, MacOSX, OpenBSD, and DragonFly, accessing a thread local variable leads to recursion
+// as the access invokes malloc. We avoid this by stealing a TLS slot from the OS internal slots so no
+// allocation is involved. On OSX we use the direct TLS slots, while on the BSD's we use space in the `pthread_t` structure.
+#if defined(__MACH__) // OSX
+#define MI_TLS_SLOT 89  // seems unused?
(__PTK_FRAMEWORK_OLDGC_KEY9) see + // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) +#elif defined(__OpenBSD__) +#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 1*sizeof(void*)) // offset `retval` +#elif defined(__DragonFly__) +#define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) +#endif +#endif + +#if defined(MI_TLS_SLOT) +static inline void* mi_tls_slot(size_t slot); // forward declaration +#elif defined(MI_TLS_PTHREAD_SLOT_OFS) +static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { + pthread_t self = pthread_self(); + return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); } -static inline void mi_tls_osx_fast_set(void* value) { - __asm__("movq %1,%%gs:%0" : "=m" (*(void**)(MI_TLS_OSX_OFFSET)) : "rn" (value)); -} -#elif defined(MI_TLS_PTHREADS) -extern pthread_key_t _mi_heap_default_key; +#elif defined(MI_TLS_PTHREAD) +extern pthread_key_t _mi_heap_default_key; #else extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from #endif static inline mi_heap_t* mi_get_default_heap(void) { -#if defined(MI_TLS_OSX_FAST) - // Use a fixed slot in the TSD on MacOSX to avoid recursion (since the loader calls malloc). - // We use slot 94 (__PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4) - // which seems unused except for the more recent Webkit - // Use with care. - mi_heap_t* heap = (mi_heap_t*)mi_tls_osx_fast_get(); +#if defined(MI_TLS_SLOT) + // Use steal a fixed slot in the TLS on MacOSX to avoid recursion (since the loader calls malloc). + mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); -#elif defined(MI_TLS_PTHREADS) - // Use pthreads for TLS; this is used on macOSX with interpose as the loader calls `malloc` - // to allocate TLS storage leading to recursive calls if __thread declared variables are accessed. - // Using pthreads allows us to initialize without recursive calls. (performance seems still quite good). - mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? (mi_heap_t*)&_mi_heap_empty : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); +#elif defined(MI_TLS_PTHREAD_SLOT_OFS) + mi_heap_t* heap = mi_tls_pthread_heap_slot(); + return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); +#elif defined(MI_TLS_PTHREAD) + mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); #else #if defined(MI_TLS_RECURSE_GUARD) - // On some BSD platforms, like openBSD, the dynamic loader calls `malloc` - // to initialize thread local data (before our module is loaded). // To avoid recursion, we need to avoid accessing the thread local `_mi_default_heap` // until our module is loaded and use the statically allocated main heap until that time. // TODO: patch ourselves dynamically to avoid this check every time? 
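The recursion guard above can be distilled into the following simplified sketch (not a verbatim copy of the surrounding code): until the process is initialized, return the statically allocated main heap so that thread-local storage — which the loader may itself allocate via malloc — is never touched:

// simplified sketch of the guard pattern; _mi_process_is_initialized and
// _mi_heap_main_get() are the declarations shown earlier in this header
static inline mi_heap_t* sketch_default_heap(void) {
  if (mi_unlikely(!_mi_process_is_initialized)) {
    return _mi_heap_main_get();   // static main heap: safe before TLS exists
  }
  return _mi_heap_default;        // thread-local heap once initialization is done
}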
- if (mi_unlikely(!_mi_process_is_initialized)) return &_mi_heap_main; + if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); #endif return _mi_heap_default; #endif @@ -344,6 +338,7 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) { } static inline uintptr_t _mi_ptr_cookie(const void* p) { + extern mi_heap_t _mi_heap_main; mi_assert_internal(_mi_heap_main.cookie != 0); return ((uintptr_t)p ^ _mi_heap_main.cookie); } @@ -669,24 +664,54 @@ static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { // Windows: works on Intel and ARM in both 32- and 64-bit return (uintptr_t)NtCurrentTeb(); } -#elif (defined(__GNUC__) || defined(__clang__)) && \ + +#elif defined(__GNUC__) && \ (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)) -// TLS register on x86 is in the FS or GS register -// see: https://akkadia.org/drepper/tls.pdf + +// TLS register on x86 is in the FS or GS register, see: https://akkadia.org/drepper/tls.pdf +static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept { + void* res; + const size_t ofs = (slot*sizeof(void*)); +#if defined(__i386__) + __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // 32-bit always uses GS +#elif defined(__MACH__) && defined(__x86_64__) + __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS +#elif defined(__x86_64__) + __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS +#elif defined(__arm__) + void** tcb; UNUSED(ofs); + asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + res = tcb[slot]; +#elif defined(__aarch64__) + void** tcb; UNUSED(ofs); + asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + res = tcb[slot]; +#endif + return res; +} + +static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { + const size_t ofs = (slot*sizeof(void*)); +#if defined(__i386__) + __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS +#elif defined(__MACH__) && defined(__x86_64__) + __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOSX uses GS +#elif defined(__x86_64__) + __asm__("movq %1,%%fs:%1" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS +#elif defined(__arm__) + void** tcb; UNUSED(ofs); + asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + tcb[slot] = value; +#elif defined(__aarch64__) + void** tcb; UNUSED(ofs); + asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + tcb[slot] = value; +#endif +} + static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { - uintptr_t tid; - #if defined(__i386__) - __asm__("movl %%gs:0, %0" : "=r" (tid) : : ); // 32-bit always uses GS - #elif defined(__MACH__) - __asm__("movq %%gs:0, %0" : "=r" (tid) : : ); // x86_64 macOS uses GS - #elif defined(__x86_64__) - __asm__("movq %%fs:0, %0" : "=r" (tid) : : ); // x86_64 Linux, BSD uses FS - #elif defined(__arm__) - asm volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid)); - #elif defined(__aarch64__) - asm volatile ("mrs %0, tpidr_el0" : "=r" (tid)); - #endif - return tid; + // normally, slot 0 is the pointer to the thread control block + return (uintptr_t)mi_tls_slot(0); } #else // otherwise use standard C diff --git a/src/init.c b/src/init.c index 960cccf1..f59daa9e 100644 --- a/src/init.c +++ b/src/init.c @@ -107,6 +107,8 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; #define tld_main_stats 
((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats))) #define tld_main_os ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os))) +extern mi_heap_t _mi_heap_main; + static mi_tld_t tld_main = { 0, false, &_mi_heap_main, @@ -146,6 +148,11 @@ static void mi_heap_main_init(void) { } } +mi_heap_t* _mi_heap_main_get(void) { + mi_heap_main_init(); + return &_mi_heap_main; +} + /* ----------------------------------------------------------- Initialization and freeing of the thread local heaps @@ -333,9 +340,11 @@ static void _mi_thread_done(mi_heap_t* heap) { void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(heap != NULL); - #if defined(MI_TLS_OSX_FAST) - mi_tls_osx_fast_set(heap); - #elif defined(MI_TLS_PTHREADS) + #if defined(MI_TLS_SLOT) + mi_tls_slot_set(MI_TLS_SLOT,heap); + #elif defined(MI_TLS_PTHREAD_SLOT_OFS) + *mi_tls_pthread_heap_slot() = heap; + #elif defined(MI_TLS_PTHREAD) // we use _mi_heap_default_key #else _mi_heap_default = heap; @@ -406,13 +415,16 @@ static void mi_allocator_done() { // Called once by the process loader static void mi_process_load(void) { + mi_heap_main_init(); + #if defined(MI_TLS_RECURSE_GUARD) volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; UNUSED(dummy); + #endif os_preloading = false; atexit(&mi_process_done); _mi_options_init(); mi_process_init(); - //mi_stats_reset(); + //mi_stats_reset();- if (mi_redirected) _mi_verbose_message("malloc is redirected.\n"); // show message from the redirector (if present) From 8bc20631e47b8c0ec79efb5f2452e958bffb4558 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 13:25:26 -0800 Subject: [PATCH 23/62] fixes for freeBSD --- include/mimalloc-internal.h | 4 +++- src/init.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 0669048e..cfbdc9ca 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -292,11 +292,13 @@ mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing hea #if defined(MI_TLS_SLOT) static inline void* mi_tls_slot(size_t slot); // forward declaration #elif defined(MI_TLS_PTHREAD_SLOT_OFS) +#include static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { pthread_t self = pthread_self(); return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); } #elif defined(MI_TLS_PTHREAD) +#include extern pthread_key_t _mi_heap_default_key; #else extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from @@ -308,7 +310,7 @@ static inline mi_heap_t* mi_get_default_heap(void) { mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); #elif defined(MI_TLS_PTHREAD_SLOT_OFS) - mi_heap_t* heap = mi_tls_pthread_heap_slot(); + mi_heap_t* heap = *mi_tls_pthread_heap_slot(); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); #elif defined(MI_TLS_PTHREAD) mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? 
_mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); diff --git a/src/init.c b/src/init.c index f59daa9e..b7f329cb 100644 --- a/src/init.c +++ b/src/init.c @@ -168,7 +168,7 @@ typedef struct mi_thread_data_s { static bool _mi_heap_init(void) { if (mi_heap_is_initialized(mi_get_default_heap())) return true; if (_mi_is_main_thread()) { - mi_assert_internal(_mi_heap_main.thread_id != 0); + // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization // the main heap is statically allocated mi_heap_main_init(); _mi_heap_set_default_direct(&_mi_heap_main); @@ -358,8 +358,9 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(mi_fls_key != 0); FlsSetValue(mi_fls_key, heap); #elif defined(MI_USE_PTHREADS) - mi_assert_internal(_mi_heap_default_key != (pthread_key_t)(-1)); + if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD pthread_setspecific(_mi_heap_default_key, heap); + } #endif } From 07fbe4f80f04a417bb19ac83113f73e1d1db3393 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 14:31:28 -0800 Subject: [PATCH 24/62] fixes for dragonfly --- include/mimalloc-internal.h | 7 +++++++ src/options.c | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index cfbdc9ca..b11cb5fe 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -285,6 +285,7 @@ mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing hea #elif defined(__OpenBSD__) #define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 1*sizeof(void*)) // offset `retval` #elif defined(__DragonFly__) +#warning "mimalloc is not working correctly on DragonFly yet." #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) #endif #endif @@ -295,6 +296,12 @@ static inline void* mi_tls_slot(size_t slot); // forward declaration #include static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { pthread_t self = pthread_self(); + #if defined(__DragonFly__) + if (self==NULL) { + static mi_heap_t* pheap_main = _mi_heap_main_get(); + return &pheap_main; + } + #endif return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); } #elif defined(MI_TLS_PTHREAD) diff --git a/src/options.c b/src/options.c index ec58c31c..0af4a485 100644 --- a/src/options.c +++ b/src/options.c @@ -70,7 +70,11 @@ static mi_option_desc_t options[_mi_option_last] = { 1, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free { 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) +#if defined(__NetBSD__) + { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed +#else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed +#endif { 100, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From 865965b8c0c83674018be95b9bfdd65a4d2f7d2e Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 15:43:13 -0800 Subject: [PATCH 25/62] fix warnings under clang-cl --- src/alloc-posix.c | 1 - src/os.c | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/alloc-posix.c b/src/alloc-posix.c index 505e42e4..234c39a3 100644 --- a/src/alloc-posix.c +++ b/src/alloc-posix.c @@ -9,7 +9,6 @@ terms of the MIT license. A copy of the license can be found in the file // mi prefixed publi definitions of various Posix, Unix, and C++ functions // for convenience and used when overriding these functions. // ------------------------------------------------------------------------ - #include "mimalloc.h" #include "mimalloc-internal.h" diff --git a/src/os.c b/src/os.c index 8427a1b5..aa49400d 100644 --- a/src/os.c +++ b/src/os.c @@ -192,7 +192,7 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats if (was_committed) _mi_stat_decrease(&stats->committed, size); _mi_stat_decrease(&stats->reserved, size); if (err) { -#pragma warning(suppress:4996) + #pragma warning(suppress:4996) _mi_warning_message("munmap failed: %s, addr 0x%8li, size %lu\n", strerror(errno), (size_t)addr, size); return false; } @@ -215,9 +215,9 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) // on modern Windows try use VirtualAlloc2 for aligned allocation if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { - MEM_ADDRESS_REQUIREMENTS reqs = { 0 }; + MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; reqs.Alignment = try_alignment; - MEM_EXTENDED_PARAMETER param = { 0 }; + MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; param.Type = MemExtendedParameterAddressRequirements; param.Pointer = &reqs; return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, ¶m, 1); @@ -828,7 +828,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) mi_win_enable_large_os_pages(); #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) - MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} }; + MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} }; // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages static bool mi_huge_pages_available = true; if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { From f5ab2c1c49bfd153db341e68dcb86fe045bec445 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 15:50:02 -0800 Subject: [PATCH 26/62] suppress spurious warnings with clang-cl --- ide/vs2017/mimalloc-override.vcxproj | 8 ++++---- ide/vs2017/mimalloc-test-stress.vcxproj | 4 ++-- ide/vs2017/mimalloc.vcxproj | 20 ++++++++++++++++---- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/ide/vs2017/mimalloc-override.vcxproj b/ide/vs2017/mimalloc-override.vcxproj index 4225a2f9..f828ba97 100644 --- a/ide/vs2017/mimalloc-override.vcxproj +++ b/ide/vs2017/mimalloc-override.vcxproj @@ -95,7 +95,7 @@ true true ../../include - MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); + _CRT_SECURE_NO_WARNINGS;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false Default @@ -123,7 +123,7 @@ true true ../../include - MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); + 
_CRT_SECURE_NO_WARNINGS;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false Default @@ -152,7 +152,7 @@ true true ../../include - MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG + _CRT_SECURE_NO_WARNINGS;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false @@ -184,7 +184,7 @@ true true ../../include - MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG + _CRT_SECURE_NO_WARNINGS;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false diff --git a/ide/vs2017/mimalloc-test-stress.vcxproj b/ide/vs2017/mimalloc-test-stress.vcxproj index 325ba3ff..b8267d0b 100644 --- a/ide/vs2017/mimalloc-test-stress.vcxproj +++ b/ide/vs2017/mimalloc-test-stress.vcxproj @@ -149,8 +149,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea7} + + {abb5eae7-b3e6-432e-b636-333449892ea6} diff --git a/ide/vs2017/mimalloc.vcxproj b/ide/vs2017/mimalloc.vcxproj index e08deec4..fa236d64 100644 --- a/ide/vs2017/mimalloc.vcxproj +++ b/ide/vs2017/mimalloc.vcxproj @@ -90,6 +90,18 @@ .lib mimalloc-static + + false + + + false + + + false + + + false + Level3 @@ -97,7 +109,7 @@ true true ../../include - MI_DEBUG=3;%(PreprocessorDefinitions); + _CRT_SECURE_NO_WARNINGS;MI_DEBUG=3;%(PreprocessorDefinitions); CompileAsCpp false stdcpp17 @@ -116,7 +128,7 @@ true true ../../include - MI_DEBUG=3;%(PreprocessorDefinitions); + _CRT_SECURE_NO_WARNINGS;MI_DEBUG=3;%(PreprocessorDefinitions); CompileAsCpp false stdcpp17 @@ -143,7 +155,7 @@ true true ../../include - %(PreprocessorDefinitions);NDEBUG + _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false @@ -170,7 +182,7 @@ true true ../../include - %(PreprocessorDefinitions);NDEBUG + _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false From 12c4108abe44ac5e084e9d12ee4dba8c7718ba24 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 16:09:09 -0800 Subject: [PATCH 27/62] update comments --- include/mimalloc-internal.h | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index b11cb5fe..75aea2e2 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -267,18 +267,25 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot } -/* ----------------------------------------------------------- - The thread local default heap ------------------------------------------------------------ */ +/* ---------------------------------------------------------------------------------------- +The thread local default heap: `_mi_get_default_heap` return the thread local heap. +On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a +__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures +that the storage will always be available (allocated on the thread stacks). +On some platforms though we cannot use that when overriding `malloc` since the underlying +TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. +We try to circumvent this in an efficient way: +- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the + loader itself calls `malloc` even before the modules are initialized. 
+- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS). +- DragonFly: not yet working. +------------------------------------------------------------------------------------------- */ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap extern bool _mi_process_is_initialized; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap #if defined(MI_MALLOC_OVERRIDE) -// On some systems, MacOSX, OpenBSD, and DragonFly, accessing a thread local variable leads to recursion -// as the access invokes malloc. We avoid this by stealing a TLS slot from the OS internal slots so no -// allocation is involved. On OSX we use the direct TLS slots, while on the BSD's we use space in the `pthread_t` structure. #if defined(__MACH__) // OSX #define MI_TLS_SLOT 89 // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) @@ -313,7 +320,6 @@ extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate static inline mi_heap_t* mi_get_default_heap(void) { #if defined(MI_TLS_SLOT) - // Use steal a fixed slot in the TLS on MacOSX to avoid recursion (since the loader calls malloc). mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); #elif defined(MI_TLS_PTHREAD_SLOT_OFS) @@ -323,10 +329,7 @@ static inline mi_heap_t* mi_get_default_heap(void) { mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); #else - #if defined(MI_TLS_RECURSE_GUARD) - // To avoid recursion, we need to avoid accessing the thread local `_mi_default_heap` - // until our module is loaded and use the statically allocated main heap until that time. - // TODO: patch ourselves dynamically to avoid this check every time? + #if defined(MI_TLS_RECURSE_GUARD) if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); #endif return _mi_heap_default; @@ -662,9 +665,8 @@ static inline size_t _mi_os_numa_node_count(void) { // ------------------------------------------------------------------- -// Getting the thread id should be performant -// as it is called in the fast path of `_mi_free`, -// so we specialize for various platforms. +// Getting the thread id should be performant as it is called in the +// fast path of `_mi_free` and we specialize for various platforms. 
// ------------------------------------------------------------------- #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -699,6 +701,7 @@ static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept { return res; } +// setting is only used on macOSX for now static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { const size_t ofs = (slot*sizeof(void*)); #if defined(__i386__) @@ -719,7 +722,7 @@ static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { } static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { - // normally, slot 0 is the pointer to the thread control block + // in all our targets, slot 0 is the pointer to the thread control block return (uintptr_t)mi_tls_slot(0); } #else From f0dc6e7e42e7e7a45d62ba96da014c5f8e568a10 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 16:21:06 -0800 Subject: [PATCH 28/62] add extra alignment test --- src/alloc-aligned.c | 2 +- test/test-api.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 05dd5fc6..c4c29ee8 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -19,9 +19,9 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t // the address at offset is aligned regardless of the allocated size. mi_assert(alignment > 0 && alignment % sizeof(void*) == 0); - if (alignment <= MI_MAX_ALIGN_SIZE && offset==0) return _mi_heap_malloc_zero(heap, size, zero); if (mi_unlikely(size > PTRDIFF_MAX)) return NULL; // we don't allocate more than PTRDIFF_MAX (see ) if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see ) + if (alignment <= MI_MAX_ALIGN_SIZE && offset==0) return _mi_heap_malloc_zero(heap, size, zero); const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` // try if there is a small block available with just the right alignment diff --git a/test/test-api.c b/test/test-api.c index 95891754..2d26e14d 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -140,6 +140,13 @@ int main() { CHECK_BODY("malloc-aligned2", { void* p = mi_malloc_aligned(48,32); result = (p != NULL && (uintptr_t)(p) % 32 == 0); mi_free(p); }); + CHECK_BODY("malloc-aligned3", { + void* p1 = mi_malloc_aligned(48,32); bool result1 = (p1 != NULL && (uintptr_t)(p1) % 32 == 0); + void* p2 = mi_malloc_aligned(48,32); bool result2 = (p2 != NULL && (uintptr_t)(p2) % 32 == 0); + mi_free(p2); + mi_free(p1); + result = (result1&&result2); + }); CHECK_BODY("malloc-aligned-at1", { void* p = mi_malloc_aligned_at(48,32,0); result = (p != NULL && ((uintptr_t)(p) + 0) % 32 == 0); mi_free(p); }); From 757dcc84115eeccb93ff23e177851c6d0d88f8ea Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 19:07:26 -0800 Subject: [PATCH 29/62] extend interpose for macOSX --- include/mimalloc-internal.h | 24 ++++++++++++------------ src/alloc-override.c | 17 +++++++++-------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 75aea2e2..37722cd9 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -269,11 +269,11 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot /* ---------------------------------------------------------------------------------------- The thread local default heap: `_mi_get_default_heap` return the thread local heap. 
-On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a +On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a __thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures -that the storage will always be available (allocated on the thread stacks). -On some platforms though we cannot use that when overriding `malloc` since the underlying -TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. +that the storage will always be available (allocated on the thread stacks). +On some platforms though we cannot use that when overriding `malloc` since the underlying +TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. We try to circumvent this in an efficient way: - macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the loader itself calls `malloc` even before the modules are initialized. @@ -285,11 +285,11 @@ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value o extern bool _mi_process_is_initialized; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap -#if defined(MI_MALLOC_OVERRIDE) +#if defined(MI_MALLOC_OVERRIDE) #if defined(__MACH__) // OSX -#define MI_TLS_SLOT 89 // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see +#define MI_TLS_SLOT 84 // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) -#elif defined(__OpenBSD__) +#elif defined(__OpenBSD__) #define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 1*sizeof(void*)) // offset `retval` #elif defined(__DragonFly__) #warning "mimalloc is not working correctly on DragonFly yet." @@ -299,7 +299,7 @@ mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing hea #if defined(MI_TLS_SLOT) static inline void* mi_tls_slot(size_t slot); // forward declaration -#elif defined(MI_TLS_PTHREAD_SLOT_OFS) +#elif defined(MI_TLS_PTHREAD_SLOT_OFS) #include static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { pthread_t self = pthread_self(); @@ -308,7 +308,7 @@ static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { static mi_heap_t* pheap_main = _mi_heap_main_get(); return &pheap_main; } - #endif + #endif return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); } #elif defined(MI_TLS_PTHREAD) @@ -319,7 +319,7 @@ extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate #endif static inline mi_heap_t* mi_get_default_heap(void) { -#if defined(MI_TLS_SLOT) +#if defined(MI_TLS_SLOT) mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); #elif defined(MI_TLS_PTHREAD_SLOT_OFS) @@ -329,7 +329,7 @@ static inline mi_heap_t* mi_get_default_heap(void) { mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); return (mi_unlikely(heap == NULL) ? 
(mi_heap_t*)&_mi_heap_empty : heap); #else - #if defined(MI_TLS_RECURSE_GUARD) + #if defined(MI_TLS_RECURSE_GUARD) if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); #endif return _mi_heap_default; @@ -665,7 +665,7 @@ static inline size_t _mi_os_numa_node_count(void) { // ------------------------------------------------------------------- -// Getting the thread id should be performant as it is called in the +// Getting the thread id should be performant as it is called in the // fast path of `_mi_free` and we specialize for various platforms. // ------------------------------------------------------------------- #if defined(_WIN32) diff --git a/src/alloc-override.c b/src/alloc-override.c index 58996c5f..c0fdf161 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -41,26 +41,27 @@ terms of the MIT license. A copy of the license can be found in the file #endif #if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE) - static void mi_free_tls_safe(void* p) { - if (mi_unlikely(_mi_preloading())) return; - mi_free(p); - } // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1` // See: struct mi_interpose_s { const void* replacement; const void* target; }; - #define MI_INTERPOSEX(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } - #define MI_INTERPOSE_MI(fun) MI_INTERPOSEX(fun,mi_##fun) + #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } + #define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) = { MI_INTERPOSE_MI(malloc), MI_INTERPOSE_MI(calloc), MI_INTERPOSE_MI(realloc), - MI_INTERPOSEX(free,mi_free_tls_safe), MI_INTERPOSE_MI(strdup), - MI_INTERPOSE_MI(strndup) + MI_INTERPOSE_MI(strndup), + MI_INTERPOSE_MI(realpath), + MI_INTERPOSE_MI(posix_memalign), + MI_INTERPOSE_MI(reallocf), + MI_INTERPOSE_MI(valloc), + // some code allocates from a zone but deallocates using plain free :-( (like NxHashResizeToCapacity ) + MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us }; #elif defined(_MSC_VER) // cannot override malloc unless using a dll. From f3c47c7c91801c712db08d6944503132defef039 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 21:03:09 -0800 Subject: [PATCH 30/62] improved malloc zone handling on macOSX (not working yet) --- include/mimalloc-internal.h | 2 +- src/alloc-override-osx.c | 24 +++++++++++++++++++++++- src/alloc-override.c | 14 +++++++------- src/alloc.c | 16 ++++++++-------- src/init.c | 8 ++++---- test/test-stress.c | 2 +- 6 files changed, 44 insertions(+), 22 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 37722cd9..4ac7da78 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -298,7 +298,7 @@ mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing hea #endif #if defined(MI_TLS_SLOT) -static inline void* mi_tls_slot(size_t slot); // forward declaration +static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept; // forward declaration #elif defined(MI_TLS_PTHREAD_SLOT_OFS) #include static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { diff --git a/src/alloc-override-osx.c b/src/alloc-override-osx.c index d4f8b06d..ed0bc2de 100644 --- a/src/alloc-override-osx.c +++ b/src/alloc-override-osx.c @@ -14,6 +14,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #error "this file should only be included on macOS" #endif +#warning "malloc zones do not seem to work for now; use MI_INTERPOSE instead" /* ------------------------------------------------------ Override system malloc on macOS This is done through the malloc zone interface. @@ -35,34 +36,42 @@ extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_im ------------------------------------------------------ */ static size_t zone_size(malloc_zone_t* zone, const void* p) { + UNUSED(zone); UNUSED(p); return 0; // as we cannot guarantee that `p` comes from us, just return 0 } static void* zone_malloc(malloc_zone_t* zone, size_t size) { + UNUSED(zone); return mi_malloc(size); } static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) { + UNUSED(zone); return mi_calloc(count, size); } static void* zone_valloc(malloc_zone_t* zone, size_t size) { + UNUSED(zone); return mi_malloc_aligned(size, _mi_os_page_size()); } static void zone_free(malloc_zone_t* zone, void* p) { + UNUSED(zone); return mi_free(p); } static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) { + UNUSED(zone); return mi_realloc(p, newsize); } static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) { + UNUSED(zone); return mi_malloc_aligned(size,alignment); } static void zone_destroy(malloc_zone_t* zone) { + UNUSED(zone); // todo: ignore for now? } @@ -83,11 +92,13 @@ static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) { } static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) { + UNUSED(zone); UNUSED(size); mi_collect(false); return 0; } static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) { + UNUSED(size); zone_free(zone,p); } @@ -102,34 +113,43 @@ static kern_return_t intro_enumerator(task_t task, void* p, vm_range_recorder_t recorder) { // todo: enumerate all memory + UNUSED(task); UNUSED(p); UNUSED(type_mask); UNUSED(zone_address); + UNUSED(reader); UNUSED(recorder); return KERN_SUCCESS; } static size_t intro_good_size(malloc_zone_t* zone, size_t size) { + UNUSED(zone); return mi_good_size(size); } static boolean_t intro_check(malloc_zone_t* zone) { + UNUSED(zone); return true; } static void intro_print(malloc_zone_t* zone, boolean_t verbose) { + UNUSED(zone); UNUSED(verbose); mi_stats_print(NULL); } static void intro_log(malloc_zone_t* zone, void* p) { + UNUSED(zone); UNUSED(p); // todo? } static void intro_force_lock(malloc_zone_t* zone) { + UNUSED(zone); // todo? } static void intro_force_unlock(malloc_zone_t* zone) { + UNUSED(zone); // todo? } static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) { + UNUSED(zone); // todo... 
stats->blocks_in_use = 0; stats->size_in_use = 0; @@ -138,6 +158,7 @@ static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) { } static boolean_t intro_zone_locked(malloc_zone_t* zone) { + UNUSED(zone); return false; } @@ -161,7 +182,6 @@ static malloc_zone_t* mi_get_default_zone() } } - static void __attribute__((constructor)) _mi_macos_override_malloc() { static malloc_introspection_t intro; @@ -201,6 +221,7 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() zone.free_definite_size = &zone_free_definite_size; zone.pressure_relief = &zone_pressure_relief; intro.zone_locked = &intro_zone_locked; + intro.statistics = &intro_statistics; // force the purgeable zone to exist to avoid strange bugs if (malloc_default_purgeable_zone) { @@ -225,6 +246,7 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() malloc_zone_unregister(purgeable_zone); malloc_zone_register(purgeable_zone); } + } #endif // MI_MALLOC_OVERRIDE diff --git a/src/alloc-override.c b/src/alloc-override.c index c0fdf161..151c2333 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -13,7 +13,7 @@ terms of the MIT license. A copy of the license can be found in the file #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif -#if defined(MI_MALLOC_OVERRIDE) && !defined(_WIN32) +#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32) || (defined(__MACH__) && !defined(MI_INTERPOSE))) // ------------------------------------------------------ // Override system malloc @@ -68,10 +68,10 @@ terms of the MIT license. A copy of the license can be found in the file // we just override new/delete which does work in a static library. #else // On all other systems forward to our API - void* malloc(size_t size) mi_attr_noexcept MI_FORWARD1(mi_malloc, size); - void* calloc(size_t size, size_t n) mi_attr_noexcept MI_FORWARD2(mi_calloc, size, n); - void* realloc(void* p, size_t newsize) mi_attr_noexcept MI_FORWARD2(mi_realloc, p, newsize); - void free(void* p) mi_attr_noexcept MI_FORWARD0(mi_free, p); + void* malloc(size_t size) MI_FORWARD1(mi_malloc, size); + void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n); + void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize); + void free(void* p) MI_FORWARD0(mi_free, p); #endif #if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__) @@ -99,8 +99,8 @@ terms of the MIT license. A copy of the license can be found in the file void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); } #if (__cplusplus >= 201402L || _MSC_VER >= 1916) - void operator delete (void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n); - void operator delete[](void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n); + void operator delete (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n); + void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n); #endif #if (__cplusplus > 201402L || defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5)) diff --git a/src/alloc.c b/src/alloc.c index 61f34353..d2fbe4b1 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -212,7 +212,7 @@ static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* bl size_t delta; bool ok = mi_page_decode_padding(page, block, &delta, &bsize); mi_assert_internal(ok); mi_assert_internal(delta <= bsize); - return (ok ? bsize - delta : 0); + return (ok ? 
bsize - delta : 0); } static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { @@ -259,7 +259,7 @@ static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, co mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); padding->delta = (uint32_t)new_delta; } -#else +#else static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { UNUSED(page); UNUSED(block); @@ -359,7 +359,7 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block } else if (mi_unlikely(mi_page_is_in_full(page))) { _mi_page_unfull(page); - } + } } else { _mi_free_block_mt(page,block); @@ -401,7 +401,7 @@ void mi_free(void* p) mi_attr_noexcept "(this may still be a valid very large allocation (over 64MiB))\n", p); if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) { _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } + } } #endif #if (MI_DEBUG!=0 || MI_SECURE>=4) @@ -421,11 +421,11 @@ void mi_free(void* p) mi_attr_noexcept mi_heap_stat_decrease(heap, malloc, bsize); if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { // huge page stats are accounted for in `_mi_page_retire` mi_heap_stat_decrease(heap, normal[_mi_bin(bsize)], 1); - } + } #endif if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks - // local, and not full or aligned + // local, and not full or aligned if (mi_unlikely(mi_check_is_double_free(page,block))) return; mi_check_padding(page, block); #if (MI_DEBUG!=0) @@ -436,7 +436,7 @@ void mi_free(void* p) mi_attr_noexcept page->used--; if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); - } + } } else { // non-local, aligned blocks, or a full page; use the more generic path @@ -473,7 +473,7 @@ size_t mi_usable_size(const void* p) mi_attr_noexcept { const mi_segment_t* const segment = _mi_ptr_segment(p); const mi_page_t* const page = _mi_segment_page_of(segment, p); const mi_block_t* const block = (const mi_block_t*)p; - const size_t size = mi_page_usable_size_of(page, block); + const size_t size = mi_page_usable_size_of(page, block); if (mi_unlikely(mi_page_has_aligned(page))) { ptrdiff_t const adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p); mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); diff --git a/src/init.c b/src/init.c index b7f329cb..2f5ca224 100644 --- a/src/init.c +++ b/src/init.c @@ -34,7 +34,7 @@ const mi_page_t _mi_page_empty = { #if defined(MI_PADDING) && (MI_INTPTR_SIZE >= 8) #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } -#elif defined(MI_PADDING) +#elif defined(MI_PADDING) #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } #else #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() } @@ -190,7 +190,7 @@ static bool _mi_heap_init(void) { heap->cookie = _mi_heap_random_next(heap) | 1; heap->keys[0] = _mi_heap_random_next(heap); heap->keys[1] = _mi_heap_random_next(heap); - heap->tld = tld; + heap->tld = tld; tld->heap_backing = heap; tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; @@ -421,9 +421,9 @@ static void mi_process_load(void) { volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; UNUSED(dummy); #endif - os_preloading = false; + os_preloading = false; atexit(&mi_process_done); - _mi_options_init(); + 
_mi_options_init(); mi_process_init(); //mi_stats_reset();- if (mi_redirected) _mi_verbose_message("malloc is redirected.\n"); diff --git a/test/test-stress.c b/test/test-stress.c index 7d8993a0..f1c8b2e1 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -38,7 +38,7 @@ static bool allow_large_objects = true; // allow very large objects? static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? -#ifdef USE_STD_MALLOC +#ifndef USE_STD_MALLOC #define custom_calloc(n,s) calloc(n,s) #define custom_realloc(p,s) realloc(p,s) #define custom_free(p) free(p) From feb0699bcb3f81cb14964ff8e3d92788241b1cd0 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 22:01:04 -0800 Subject: [PATCH 31/62] fix aligment check when padding is enabled --- ide/vs2019/mimalloc-override.vcxproj.filters | 8 ++++---- src/alloc-aligned.c | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ide/vs2019/mimalloc-override.vcxproj.filters b/ide/vs2019/mimalloc-override.vcxproj.filters index 83d6f7fe..8e36f50e 100644 --- a/ide/vs2019/mimalloc-override.vcxproj.filters +++ b/ide/vs2019/mimalloc-override.vcxproj.filters @@ -1,9 +1,6 @@  - - Header Files - Source Files @@ -49,6 +46,9 @@ Source Files + + Source Files + @@ -78,4 +78,4 @@ {39cb7e38-69d0-43fb-8406-6a0f7cefc3b4} - + \ No newline at end of file diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index c4c29ee8..40362068 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -25,15 +25,16 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` // try if there is a small block available with just the right alignment - if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { - mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE); + const size_t padsize = size + MI_PADDING_SIZE; + if (mi_likely(padsize <= MI_SMALL_SIZE_MAX)) { + mi_page_t* page = _mi_heap_get_free_small_page(heap,padsize); const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0; if (mi_likely(page->free != NULL && is_aligned)) { #if MI_STAT>1 mi_heap_stat_increase( heap, malloc, size); #endif - void* p = _mi_page_malloc(heap,page,size + MI_PADDING_SIZE); // TODO: inline _mi_page_malloc + void* p = _mi_page_malloc(heap,page,padsize); // TODO: inline _mi_page_malloc mi_assert_internal(p != NULL); mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); if (zero) _mi_block_zero_init(page,p,size); @@ -42,7 +43,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t } // use regular allocation if it is guaranteed to fit the alignment constraints - if (offset==0 && alignment<=size && size<=MI_MEDIUM_OBJ_SIZE_MAX && (size&align_mask)==0) { + if (offset==0 && alignment<=padsize && padsize<=MI_MEDIUM_OBJ_SIZE_MAX && (padsize&align_mask)==0) { void* p = _mi_heap_malloc_zero(heap, size, zero); mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); return p; From 1c2e0a47cada2cd689f34db18b28ca41a53cc1f6 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 22:04:53 -0800 Subject: [PATCH 32/62] fix noexcept attribute on array delete operators --- include/mimalloc-new-delete.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc-new-delete.h b/include/mimalloc-new-delete.h index 050f9433..fded0c04 100644 --- a/include/mimalloc-new-delete.h +++ b/include/mimalloc-new-delete.h @@ -32,8 +32,8 @@ terms of the MIT license. 
A copy of the license can be found in the file void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); } #if (__cplusplus >= 201402L || _MSC_VER >= 1916) - void operator delete (void* p, std::size_t n) { mi_free_size(p,n); }; - void operator delete[](void* p, std::size_t n) { mi_free_size(p,n); }; + void operator delete (void* p, std::size_t n) noexcept { mi_free_size(p,n); }; + void operator delete[](void* p, std::size_t n) noexcept { mi_free_size(p,n); }; #endif #if (__cplusplus > 201402L || defined(__cpp_aligned_new)) From b241be7075c32bd3952f4d9f7eb22c6531b8397e Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 22:08:33 -0800 Subject: [PATCH 33/62] reenable mimalloc in the stress test --- test/test-stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-stress.c b/test/test-stress.c index f1c8b2e1..7d8993a0 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -38,7 +38,7 @@ static bool allow_large_objects = true; // allow very large objects? static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? -#ifndef USE_STD_MALLOC +#ifdef USE_STD_MALLOC #define custom_calloc(n,s) calloc(n,s) #define custom_realloc(p,s) realloc(p,s) #define custom_free(p) free(p) From 3560e0a867a82b6a593a01ac4995c11498f0a167 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 22:15:09 -0800 Subject: [PATCH 34/62] fix TLS slot number on OSX --- include/mimalloc-internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 4ac7da78..b2297c50 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -268,7 +268,7 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot /* ---------------------------------------------------------------------------------------- -The thread local default heap: `_mi_get_default_heap` return the thread local heap. +The thread local default heap: `_mi_get_default_heap` returns the thread local heap. On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a __thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures that the storage will always be available (allocated on the thread stacks). @@ -287,7 +287,7 @@ mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing hea #if defined(MI_MALLOC_OVERRIDE) #if defined(__MACH__) // OSX -#define MI_TLS_SLOT 84 // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see +#define MI_TLS_SLOT 89 // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) #elif defined(__OpenBSD__) #define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 1*sizeof(void*)) // offset `retval` From a96e94f940db7d844030239bfbedd004d5915657 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 2 Feb 2020 22:46:38 -0800 Subject: [PATCH 35/62] change TLS slot on OpenBSD --- include/mimalloc-internal.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index b2297c50..cea6b9c3 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -287,10 +287,13 @@ mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing hea #if defined(MI_MALLOC_OVERRIDE) #if defined(__MACH__) // OSX -#define MI_TLS_SLOT 89 // seems unused? 
(__PTK_FRAMEWORK_OLDGC_KEY9) see - // possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) +#define MI_TLS_SLOT 89 // seems unused? +// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) +// see #elif defined(__OpenBSD__) -#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 1*sizeof(void*)) // offset `retval` +// use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) +// see +#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24) #elif defined(__DragonFly__) #warning "mimalloc is not working correctly on DragonFly yet." #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) From e67606210326c838b8fa3004a83721df4d3c6dbe Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 5 Feb 2020 17:40:13 -0800 Subject: [PATCH 36/62] update mac zone code --- src/alloc-override-osx.c | 67 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/src/alloc-override-osx.c b/src/alloc-override-osx.c index ed0bc2de..99c6a134 100644 --- a/src/alloc-override-osx.c +++ b/src/alloc-override-osx.c @@ -182,6 +182,72 @@ static malloc_zone_t* mi_get_default_zone() } } +// directly overwrite the default zone as per: +// + +static void __attribute__((constructor)) _mi_macos_override_malloc_direct() +{ + static malloc_introspection_t intro; + memset(&intro, 0, sizeof(intro)); + + intro.enumerator = &intro_enumerator; + intro.good_size = &intro_good_size; + intro.check = &intro_check; + intro.print = &intro_print; + intro.log = &intro_log; + intro.force_lock = &intro_force_lock; + intro.force_unlock = &intro_force_unlock; + + static malloc_zone_t oldzone; + static malloc_zone_t* zone = malloc_default_zone(); // get the `malloc` backing default zone + if (zone == NULL) return; + + // save the default zone in oldzone + memset(&oldzone, 0, sizeof(oldzone)); + if (zone->version >= 9) memcpy(&oldzone, zone, sizeof(oldzone)); + + // overwrite default zone functions in-place + zone->zone_name = "mimalloc"; + zone->size = &zone_size; + zone->introspect = &intro; + zone->malloc = &zone_malloc; + zone->calloc = &zone_calloc; + zone->valloc = &zone_valloc; + zone->free = &zone_free; + zone->realloc = &zone_realloc; + zone->destroy = &zone_destroy; + zone->batch_malloc = &zone_batch_malloc; + zone->batch_free = &zone_batch_free; + + malloc_zone_t* purgeable_zone = NULL; + +#if defined(MAC_OS_X_VERSION_10_6) && \ + MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 + // switch to version 9 on OSX 10.6 to support memalign. + // zone->version = 9; + zone->memalign = &zone_memalign; + zone->free_definite_size = &zone_free_definite_size; + zone->pressure_relief = &zone_pressure_relief; + intro.zone_locked = &intro_zone_locked; + intro.statistics = &intro_statistics; + /* + // force the purgeable zone to exist to avoid strange bugs + if (malloc_default_purgeable_zone) { + purgeable_zone = malloc_default_purgeable_zone(); + } + */ +#endif + /* + // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs + // earlier than the default zone. 
+ if (purgeable_zone != NULL) { + malloc_zone_unregister(purgeable_zone); + malloc_zone_register(purgeable_zone); + } + */ +} + +/* static void __attribute__((constructor)) _mi_macos_override_malloc() { static malloc_introspection_t intro; @@ -248,5 +314,6 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() } } +*/ #endif // MI_MALLOC_OVERRIDE From 9062f397649da3b4851d9107cc5a2b01021faff5 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 8 Feb 2020 20:08:52 -0800 Subject: [PATCH 37/62] enable interpose separate from zones on macOS --- CMakeLists.txt | 16 +++++++++++----- src/alloc-override-osx.c | 20 ++++++++++++++++---- src/alloc-override.c | 2 +- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2da7974b..e16830aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,11 +5,12 @@ set(CMAKE_C_STANDARD 11) set(CMAKE_CXX_STANDARD 17) option(MI_OVERRIDE "Override the standard malloc interface" ON) -option(MI_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_DEBUG_FULL "Use full internal heap invariant checking in DEBUG mode" OFF) option(MI_SECURE "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF) option(MI_SEE_ASM "Generate assembly files" OFF) +option(MI_INTERPOSE "Use interpose to override standard malloc on macOS" ON) +option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" OFF) # enables interpose as well option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) option(MI_BUILD_TESTS "Build test executables" ON) option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF) @@ -61,14 +62,19 @@ endif() if(MI_OVERRIDE MATCHES "ON") message(STATUS "Override standard malloc (MI_OVERRIDE=ON)") if(APPLE) + if(MI_OSX_ZONE MATCHES "ON") + # use zone's on macOS + message(STATUS " Use malloc zone to override malloc (MI_OSX_ZONE=ON)") + list(APPEND mi_sources src/alloc-override-osx.c) + if(NOT MI_INTERPOSE MATCHES "ON") + message(STATUS " (enabling INTERPOSE as well since zone's require this)") + set(MI_INTERPOSE "ON") + endif() + endif() if(MI_INTERPOSE MATCHES "ON") # use interpose on macOS message(STATUS " Use interpose to override malloc (MI_INTERPOSE=ON)") list(APPEND mi_defines MI_INTERPOSE) - else() - # use zone's on macOS - message(STATUS " Use zone's to override malloc (MI_INTERPOSE=OFF)") - list(APPEND mi_sources src/alloc-override-osx.c) endif() endif() endif() diff --git a/src/alloc-override-osx.c b/src/alloc-override-osx.c index 99c6a134..92d5ce2b 100644 --- a/src/alloc-override-osx.c +++ b/src/alloc-override-osx.c @@ -14,7 +14,6 @@ terms of the MIT license. A copy of the license can be found in the file #error "this file should only be included on macOS" #endif -#warning "malloc zones do not seem to work for now; use MI_INTERPOSE instead" /* ------------------------------------------------------ Override system malloc on macOS This is done through the malloc zone interface. 
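For readers unfamiliar with the zone interface being patched here, this is a condensed, hypothetical sketch of the takeover sequence this file performs (the `mi_zone` variable stands in for the fully populated zone; error handling and zone version checks are omitted):

#include <malloc/malloc.h>

static malloc_zone_t mi_zone;   // assumed filled in with zone_malloc, zone_free, etc.

static void sketch_register_zone(void) {
  malloc_zone_t* def = malloc_default_zone();
  malloc_zone_register(&mi_zone);   // make our zone known to the system
  malloc_zone_unregister(def);      // retire the old default zone...
  malloc_zone_register(def);        // ...and re-register it so ours becomes the default
}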
@@ -182,8 +181,10 @@ static malloc_zone_t* mi_get_default_zone() } } +#if 0 // directly overwrite the default zone as per: // +#include static void __attribute__((constructor)) _mi_macos_override_malloc_direct() { @@ -199,13 +200,18 @@ static void __attribute__((constructor)) _mi_macos_override_malloc_direct() intro.force_unlock = &intro_force_unlock; static malloc_zone_t oldzone; - static malloc_zone_t* zone = malloc_default_zone(); // get the `malloc` backing default zone + static malloc_zone_t* zone; + zone = mi_get_default_zone(); // get the `malloc` backing default zone if (zone == NULL) return; // save the default zone in oldzone memset(&oldzone, 0, sizeof(oldzone)); if (zone->version >= 9) memcpy(&oldzone, zone, sizeof(oldzone)); + if (zone->version >= 8) { + vm_protect(mach_task_self(), (uintptr_t)zone, sizeof(*zone), 0, + VM_PROT_READ|VM_PROT_WRITE); + } // overwrite default zone functions in-place zone->zone_name = "mimalloc"; zone->size = &zone_size; @@ -237,6 +243,11 @@ static void __attribute__((constructor)) _mi_macos_override_malloc_direct() } */ #endif + if (zone->version >= 8) { + vm_protect(mach_task_self(), (uintptr_t)zone, sizeof(*zone), 0, + VM_PROT_READ); + } + /* // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs // earlier than the default zone. @@ -247,7 +258,8 @@ static void __attribute__((constructor)) _mi_macos_override_malloc_direct() */ } -/* +#else + static void __attribute__((constructor)) _mi_macos_override_malloc() { static malloc_introspection_t intro; @@ -314,6 +326,6 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() } } -*/ +#endif #endif // MI_MALLOC_OVERRIDE diff --git a/src/alloc-override.c b/src/alloc-override.c index 151c2333..c0e7bc2b 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -13,7 +13,7 @@ terms of the MIT license. A copy of the license can be found in the file #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif -#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32) || (defined(__MACH__) && !defined(MI_INTERPOSE))) +#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) // || (defined(__MACH__) && !defined(MI_INTERPOSE))) // ------------------------------------------------------ // Override system malloc From afe434463ac92bc140691c55c3922a53f4324bfb Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 9 Feb 2020 18:26:50 -0800 Subject: [PATCH 38/62] add comments on overriding in macOSX --- src/alloc-override-osx.c | 86 +++------------------------------------- 1 file changed, 6 insertions(+), 80 deletions(-) diff --git a/src/alloc-override-osx.c b/src/alloc-override-osx.c index 92d5ce2b..cc03f5e2 100644 --- a/src/alloc-override-osx.c +++ b/src/alloc-override-osx.c @@ -17,6 +17,12 @@ terms of the MIT license. A copy of the license can be found in the file /* ------------------------------------------------------ Override system malloc on macOS This is done through the malloc zone interface. + It seems we also need to interpose (see `alloc-override.c`) + or otherwise we get zone errors as there are usually + already allocations done by the time we take over the + zone. Unfortunately, that means we need to replace + the `free` with a checked free (`cfree`) impacting + performance. 
------------------------------------------------------ */ #include @@ -181,85 +187,6 @@ static malloc_zone_t* mi_get_default_zone() } } -#if 0 -// directly overwrite the default zone as per: -// -#include - -static void __attribute__((constructor)) _mi_macos_override_malloc_direct() -{ - static malloc_introspection_t intro; - memset(&intro, 0, sizeof(intro)); - - intro.enumerator = &intro_enumerator; - intro.good_size = &intro_good_size; - intro.check = &intro_check; - intro.print = &intro_print; - intro.log = &intro_log; - intro.force_lock = &intro_force_lock; - intro.force_unlock = &intro_force_unlock; - - static malloc_zone_t oldzone; - static malloc_zone_t* zone; - zone = mi_get_default_zone(); // get the `malloc` backing default zone - if (zone == NULL) return; - - // save the default zone in oldzone - memset(&oldzone, 0, sizeof(oldzone)); - if (zone->version >= 9) memcpy(&oldzone, zone, sizeof(oldzone)); - - if (zone->version >= 8) { - vm_protect(mach_task_self(), (uintptr_t)zone, sizeof(*zone), 0, - VM_PROT_READ|VM_PROT_WRITE); - } - // overwrite default zone functions in-place - zone->zone_name = "mimalloc"; - zone->size = &zone_size; - zone->introspect = &intro; - zone->malloc = &zone_malloc; - zone->calloc = &zone_calloc; - zone->valloc = &zone_valloc; - zone->free = &zone_free; - zone->realloc = &zone_realloc; - zone->destroy = &zone_destroy; - zone->batch_malloc = &zone_batch_malloc; - zone->batch_free = &zone_batch_free; - - malloc_zone_t* purgeable_zone = NULL; - -#if defined(MAC_OS_X_VERSION_10_6) && \ - MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 - // switch to version 9 on OSX 10.6 to support memalign. - // zone->version = 9; - zone->memalign = &zone_memalign; - zone->free_definite_size = &zone_free_definite_size; - zone->pressure_relief = &zone_pressure_relief; - intro.zone_locked = &intro_zone_locked; - intro.statistics = &intro_statistics; - /* - // force the purgeable zone to exist to avoid strange bugs - if (malloc_default_purgeable_zone) { - purgeable_zone = malloc_default_purgeable_zone(); - } - */ -#endif - if (zone->version >= 8) { - vm_protect(mach_task_self(), (uintptr_t)zone, sizeof(*zone), 0, - VM_PROT_READ); - } - - /* - // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs - // earlier than the default zone. - if (purgeable_zone != NULL) { - malloc_zone_unregister(purgeable_zone); - malloc_zone_register(purgeable_zone); - } - */ -} - -#else - static void __attribute__((constructor)) _mi_macos_override_malloc() { static malloc_introspection_t intro; @@ -326,6 +253,5 @@ static void __attribute__((constructor)) _mi_macos_override_malloc() } } -#endif #endif // MI_MALLOC_OVERRIDE From 5ba87e56c94a83db919be33fe5449bebc39e9d3e Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 9 Feb 2020 18:32:09 -0800 Subject: [PATCH 39/62] update readme for 1.5 release --- readme.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/readme.md b/readme.md index baac2a93..8d3d22e0 100644 --- a/readme.md +++ b/readme.md @@ -10,7 +10,7 @@ mimalloc (pronounced "me-malloc") is a general purpose allocator with excellent [performance](#performance) characteristics. Initially developed by Daan Leijen for the run-time systems of the -[Koka](https://github.com/koka-lang/koka) and [Lean](https://github.com/leanprover/lean) languages. +[Koka](https://github.com/koka-lang/koka) and [Lean](https://github.com/leanprover/lean) languages. Latest release:`v1.4.0` (2020-01-22). 
It is a drop-in replacement for `malloc` and can be used in other programs @@ -47,7 +47,7 @@ It also has an easy way to override the allocator in [Windows](#override_on_wind - __fast__: In our benchmarks (see [below](#performance)), _mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc), and usually uses less memory (up to 25% more in the worst case). A nice property - is that it does consistently well over a wide range of benchmarks. There is also good huge OS page + is that it does consistently well over a wide range of benchmarks. There is also good huge OS page support for larger server programs. The [documentation](https://microsoft.github.io/mimalloc) gives a full overview of the API. @@ -57,7 +57,8 @@ Enjoy! ### Releases -* 2020-01-22, `v1.4.0`: stable release 1.4: improved performance for delayed OS page reset, +* 2020-02-09, `v1.5.0`: stable release 1.5: improved free performance, small bug fixes. +* 2020-01-22, `v1.4.0`: stable release 1.4: improved performance for delayed OS page reset, more eager concurrent free, addition of STL allocator, fixed potential memory leak. * 2020-01-15, `v1.3.0`: stable release 1.3: bug fixes, improved randomness and [stronger free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) in secure mode. @@ -212,13 +213,13 @@ or via environment variables. - `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB huge OS pages. This reserves the huge pages at startup and can give quite a (latency) performance improvement on long running workloads. Usually it is better to not use `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving - contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at + contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at startup only once). Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). With huge OS pages, it may be beneficial to set the setting `MIMALLOC_EAGER_COMMIT_DELAY=N` (with usually `N` as 1) to delay the initial `N` segments From 04f1c3b1e23677ce03bd16137e73089abd552175 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 9 Feb 2020 18:53:39 -0800 Subject: [PATCH 40/62] bump version to v1.6.0 --- cmake/mimalloc-config-version.cmake | 2 +- include/mimalloc.h | 2 +- readme.md | 6 ++++++ test/CMakeLists.txt | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index 5137be80..6454d91f 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,5 +1,5 @@ set(mi_version_major 1) -set(mi_version_minor 5) +set(mi_version_minor 6) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/include/mimalloc.h b/include/mimalloc.h index 346774b7..552a8b2b 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 150 // major + 2 digits minor +#define MI_MALLOC_VERSION 160 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes diff --git a/readme.md b/readme.md index 8d3d22e0..56f0430c 100644 --- a/readme.md +++ b/readme.md @@ -57,6 +57,12 @@ Enjoy! ### Releases +* 2020-02-09, `v1.6.0`: stable release 1.6: fixed potential memory leak, improved overriding + and thread local support on FreeBSD, NetBSD, DragonFly, and macOSX. New byte-precise + heap block overflow detection in debug mode (besides the double-free detection and free-list + corruption detection). Add `nodiscard` attribute to most allocation functions. + Enable `MIMALLOC_PAGE_RESET` by default. New reclamation strategy for abandoned heap pages + for better memory footprint. * 2020-02-09, `v1.5.0`: stable release 1.5: improved free performance, small bug fixes. * 2020-01-22, `v1.4.0`: stable release 1.4: improved performance for delayed OS page reset, more eager concurrent free, addition of STL allocator, fixed potential memory leak. diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ce077d14..4152f99d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -13,7 +13,7 @@ if (NOT CMAKE_BUILD_TYPE) endif() # Import mimalloc (if installed) -find_package(mimalloc 1.5 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH) +find_package(mimalloc 1.6 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH) message(STATUS "Found mimalloc installed at: ${MIMALLOC_TARGET_DIR}") # overriding with a dynamic library From 0a77b7423f5beb4fb88def78cae84cdb368f0c8c Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 9 Feb 2020 19:12:19 -0800 Subject: [PATCH 41/62] Update readme.md --- readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 56f0430c..e4e96ba7 100644 --- a/readme.md +++ b/readme.md @@ -11,7 +11,7 @@ mimalloc (pronounced "me-malloc") is a general purpose allocator with excellent [performance](#performance) characteristics. Initially developed by Daan Leijen for the run-time systems of the [Koka](https://github.com/koka-lang/koka) and [Lean](https://github.com/leanprover/lean) languages. -Latest release:`v1.4.0` (2020-01-22). +Latest release:`v1.6.0` (2020-02-09). It is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) 
you can use it as: From 9749c83ca0aa5b540a3cb4e901e471aa64423255 Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 13 Feb 2020 09:16:41 -0800 Subject: [PATCH 42/62] fix build with debug and secure both enabled, issue #203 --- src/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/page.c b/src/page.c index 23a04a84..6aaef428 100644 --- a/src/page.c +++ b/src/page.c @@ -105,7 +105,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(mi_page_is_valid_init(page)); #if MI_SECURE - mi_assert_internal(page->key != 0); + mi_assert_internal(page->keys[0] != 0); #endif if (mi_page_heap(page)!=NULL) { mi_segment_t* segment = _mi_page_segment(page); From f42b8526d0767ae6605f43a198fe984b3d19aa5e Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 13 Feb 2020 10:36:39 -0800 Subject: [PATCH 43/62] fix wrong __declspec(restrict) and __attribute__((malloc)) attributes on reallocation functions --- include/mimalloc.h | 155 ++++++++++++++++++++--------------------- src/alloc-aligned.c | 48 ++++++------- src/alloc-posix.c | 12 ++-- src/alloc.c | 66 +++++++++--------- test/main-override.cpp | 36 +++++++--- 5 files changed, 168 insertions(+), 149 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 552a8b2b..d1120e9f 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -43,9 +43,9 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_decl_export __declspec(dllimport) #endif #if (_MSC_VER >= 1900) && !defined(__EDG__) - #define mi_decl_allocator __declspec(allocator) __declspec(restrict) + #define mi_decl_restrict __declspec(allocator) __declspec(restrict) #else - #define mi_decl_allocator __declspec(restrict) + #define mi_decl_restrict __declspec(restrict) #endif #define mi_cdecl __cdecl #define mi_attr_malloc @@ -55,7 +55,7 @@ terms of the MIT license. A copy of the license can be found in the file #elif defined(__GNUC__) // includes clang and icc #define mi_cdecl // leads to warnings... __attribute__((cdecl)) #define mi_decl_export __attribute__((visibility("default"))) - #define mi_decl_allocator + #define mi_decl_restrict #define mi_attr_malloc __attribute__((malloc)) #if (defined(__clang_major__) && (__clang_major__ < 4)) || (__GNUC__ < 5) #define mi_attr_alloc_size(s) @@ -73,7 +73,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #else #define mi_cdecl #define mi_decl_export - #define mi_decl_allocator + #define mi_decl_restrict #define mi_attr_malloc #define mi_attr_alloc_size(s) #define mi_attr_alloc_size2(s1,s2) @@ -95,15 +95,15 @@ extern "C" { // Standard malloc interface // ------------------------------------------------------ -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_calloc(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export mi_decl_allocator void* mi_expand(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_nodiscard mi_decl_export void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2); +mi_decl_export void* mi_expand(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2); mi_decl_export void mi_free(void* p) mi_attr_noexcept; -mi_decl_nodiscard mi_decl_export char* mi_strdup(const char* s) mi_attr_noexcept; -mi_decl_nodiscard mi_decl_export char* mi_strndup(const char* s, size_t n) mi_attr_noexcept; -mi_decl_nodiscard mi_decl_export char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept mi_attr_malloc; +mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept mi_attr_malloc; +mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc; // ------------------------------------------------------ // Extended functionality @@ -111,13 +111,13 @@ mi_decl_nodiscard mi_decl_export char* mi_realpath(const char* fname, char* reso #define MI_SMALL_WSIZE_MAX (128) #define MI_SMALL_SIZE_MAX (MI_SMALL_WSIZE_MAX*sizeof(void*)) -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_zalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept 
mi_attr_malloc mi_attr_alloc_size2(2,3); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_nodiscard mi_decl_export void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3); +mi_decl_nodiscard mi_decl_export void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept; @@ -155,14 +155,14 @@ mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_ // allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`. // ------------------------------------------------------------------------------------- -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* 
mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);

 // -------------------------------------------------------------------------------------
@@ -180,28 +180,28 @@ mi_decl_export mi_heap_t* mi_heap_get_default(void);
 mi_decl_export mi_heap_t* mi_heap_get_backing(void);
 mi_decl_export void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept;

-mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
-mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
-mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
-mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
-mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);

-mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3);
-mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept;
-mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3);

-mi_decl_nodiscard mi_decl_export char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept;
-mi_decl_nodiscard mi_decl_export char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n)
mi_attr_noexcept; -mi_decl_nodiscard mi_decl_export char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept mi_attr_malloc; +mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept mi_attr_malloc; +mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc; -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3) mi_attr_alloc_align(4); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3) mi_attr_alloc_align(4); -mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3) mi_attr_alloc_align(4); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); 
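A note on the pattern in these hunks (not stated in the patch itself): plain allocation entry points keep `mi_decl_restrict` and `mi_attr_malloc`, while every `realloc`-style entry point loses them. Those attributes promise the compiler that the returned pointer aliases nothing that was valid before the call, which holds for fresh allocations but not for reallocation, where the result may equal the argument or still carry the old contents. A hypothetical illustration of why the attribute is unsound there (`my_realloc` is a made-up name):

```c
// WRONG: a malloc-like attribute on a realloc-style function.
void* my_realloc(void* p, size_t n) __attribute__((malloc));

int example(int* p) {
  int x = p[0];                      // the optimizer may treat this value...
  int* q = (int*)my_realloc(p, 64);
  return x + q[0];                   // ...as unrelated to q[0], even though
                                     // q may equal p or hold p's old data.
}
```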
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4); +mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3); // -------------------------------------------------------------------------------- @@ -211,21 +211,21 @@ mi_decl_nodiscard mi_decl_export mi_decl_allocator void* mi_heap_realloc_aligned // see // -------------------------------------------------------------------------------- -mi_decl_export mi_decl_allocator void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export mi_decl_allocator void* mi_recalloc(void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); +mi_decl_nodiscard mi_decl_export void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export void* mi_recalloc(void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3); -mi_decl_export mi_decl_allocator void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); -mi_decl_export mi_decl_allocator void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_export mi_decl_allocator void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3) mi_attr_alloc_align(4); -mi_decl_export mi_decl_allocator void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); +mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(2,3) mi_attr_alloc_align(4); +mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(2,3); -mi_decl_export mi_decl_allocator void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); -mi_decl_export mi_decl_allocator void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(3,4); +mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); +mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4); -mi_decl_export mi_decl_allocator void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3) mi_attr_alloc_align(4); -mi_decl_export mi_decl_allocator void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t 
alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); -mi_decl_export mi_decl_allocator void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(3,4) mi_attr_alloc_align(5); -mi_decl_export mi_decl_allocator void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(3,4); +mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4); +mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3); +mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(3,4) mi_attr_alloc_align(5); +mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3,4); // ------------------------------------------------------ @@ -233,7 +233,6 @@ mi_decl_export mi_decl_allocator void* mi_heap_recalloc_aligned_at(mi_heap_t* he // ------------------------------------------------------ mi_decl_export bool mi_heap_contains_block(mi_heap_t* heap, const void* p); - mi_decl_export bool mi_heap_check_owned(mi_heap_t* heap, const void* p); mi_decl_export bool mi_check_owned(const void* p); @@ -323,42 +322,42 @@ mi_decl_export void mi_option_set_default(mi_option_t option, long value); // ------------------------------------------------------------------------------------------------------- // "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions. // (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.) +// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our heap before free-ing. 
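A sketch of what such a checked free can look like, assuming only the public `mi_is_in_heap_region` predicate (this is a reconstruction for illustration, not the verbatim implementation):

```c
#include <mimalloc.h>

// Free `p` only when it lies in a mimalloc-managed region; pointers that the
// system allocator handed out before the override took effect are left alone.
void checked_free(void* p) {  // hypothetical name; the exported one is mi_cfree
  if (mi_is_in_heap_region(p)) {
    mi_free(p);
  }
}
```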
// ------------------------------------------------------------------------------------------------------- -mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept; -mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept; mi_decl_export void mi_cfree(void* p) mi_attr_noexcept; mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept; -mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept; -mi_decl_nodiscard mi_decl_export void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); -mi_decl_nodiscard mi_decl_export void* mi_valloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); - -mi_decl_nodiscard mi_decl_export void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); -mi_decl_nodiscard mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); +mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); +mi_decl_nodiscard mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3); mi_decl_nodiscard mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept; -mi_decl_nodiscard mi_decl_export unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept; -mi_decl_nodiscard mi_decl_export unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept; -mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept mi_attr_malloc; +mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept mi_attr_malloc; +mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept; mi_decl_export int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept; -mi_decl_export void mi_free_size(void* p, size_t size) mi_attr_noexcept; +mi_decl_export void mi_free_size(void* p, size_t size) mi_attr_noexcept; mi_decl_export void mi_free_size_aligned(void* p, size_t 
size, size_t alignment) mi_attr_noexcept; -mi_decl_export void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept; +mi_decl_export void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept; // The `mi_new` wrappers implement C++ semantics on out-of-memory instead of directly returning `NULL`. // (and call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception). -mi_decl_nodiscard mi_decl_export void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); -mi_decl_nodiscard mi_decl_export void* mi_new_nothrow(size_t size) mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); -mi_decl_nodiscard mi_decl_export void* mi_new_n(size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(1, 2); -mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_n(size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(1, 2); +mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_alloc_size2(2, 3); #ifdef __cplusplus } diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 40362068..85408868 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -64,53 +64,53 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t } -mi_decl_allocator void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false); } -mi_decl_allocator void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_malloc_aligned_at(heap, size, alignment, 0); } -mi_decl_allocator void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true); } -mi_decl_allocator void* mi_heap_zalloc_aligned(mi_heap_t* heap, 
size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_zalloc_aligned_at(heap, size, alignment, 0); } -mi_decl_allocator void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; return mi_heap_zalloc_aligned_at(heap, total, alignment, offset); } -mi_decl_allocator void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_calloc_aligned_at(heap,count,size,alignment,0); } -mi_decl_allocator void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset); } -mi_decl_allocator void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment); } -mi_decl_allocator void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset); } -mi_decl_allocator void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment); } -mi_decl_allocator void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset); } -mi_decl_allocator void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept { +mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment); } @@ -153,55 +153,55 @@ static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsi return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero); } -mi_decl_allocator void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { +void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false); } -mi_decl_allocator void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { +void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { return 
mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false); } -mi_decl_allocator void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { +void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true); } -mi_decl_allocator void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { +void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true); } -mi_decl_allocator void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(newcount, size, &total)) return NULL; return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset); } -mi_decl_allocator void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { +void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(newcount, size, &total)) return NULL; return mi_heap_rezalloc_aligned(heap, p, total, alignment); } -mi_decl_allocator void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { +void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset); } -mi_decl_allocator void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { +void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment); } -mi_decl_allocator void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { +void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset); } -mi_decl_allocator void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { +void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment); } -mi_decl_allocator void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset); } -mi_decl_allocator void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { +void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment); } diff --git a/src/alloc-posix.c b/src/alloc-posix.c index ffc75373..c74b6082 100644 
--- a/src/alloc-posix.c +++ b/src/alloc-posix.c @@ -55,24 +55,24 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept return 0; } -void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { +mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { void* p = (alignment <= MI_MAX_ALIGN_SIZE ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); mi_assert_internal(((uintptr_t)p % alignment) == 0); return p; } -void* mi_valloc(size_t size) mi_attr_noexcept { +mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept { return mi_malloc_aligned(size, _mi_os_page_size()); } -void* mi_pvalloc(size_t size) mi_attr_noexcept { +mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept { size_t psize = _mi_os_page_size(); if (size >= SIZE_MAX - psize) return NULL; // overflow size_t asize = ((size + psize - 1) / psize) * psize; return mi_malloc_aligned(asize, psize); } -void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { +mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { if (alignment==0 || !_mi_is_power_of_two(alignment)) return NULL; if ((size&(alignment-1)) != 0) return NULL; // C11 requires integral multiple, see void* p = (alignment <= MI_MAX_ALIGN_SIZE ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); @@ -92,7 +92,7 @@ void* mi__expand(void* p, size_t newsize) mi_attr_noexcept { // Microsoft return res; } -unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept { +mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept { if (s==NULL) return NULL; size_t len; for(len = 0; s[len] != 0; len++) { } @@ -104,7 +104,7 @@ unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept { return p; } -unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept { +mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept { return (unsigned char*)mi_strdup((const char*)s); } diff --git a/src/alloc.c b/src/alloc.c index d2fbe4b1..b080e6fc 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -58,7 +58,7 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz } // allocate a small block -extern inline mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { +extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { mi_assert(heap!=NULL); mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local mi_assert(size <= MI_SMALL_SIZE_MAX); @@ -74,12 +74,12 @@ extern inline mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size return p; } -extern inline mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept { +extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept { return mi_heap_malloc_small(mi_get_default_heap(), size); } // The main allocation function -extern inline mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { +extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { return mi_heap_malloc_small(heap, size); } @@ -98,7 +98,7 @@ extern inline mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t siz } } -extern inline mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept { +extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept { 
return mi_heap_malloc(mi_get_default_heap(), size); } @@ -122,7 +122,7 @@ void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) { } // zero initialized small block -mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept { +mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept { void* p = mi_malloc_small(size); if (p != NULL) { _mi_block_zero_init(_mi_ptr_page(p), p, size); // todo: can we avoid getting the page again? @@ -138,11 +138,11 @@ void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) { return p; } -extern inline mi_decl_allocator void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { +extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { return _mi_heap_malloc_zero(heap, size, true); } -mi_decl_allocator void* mi_zalloc(size_t size) mi_attr_noexcept { +mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept { return mi_heap_zalloc(mi_get_default_heap(),size); } @@ -523,29 +523,29 @@ void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { mi_free(p); } -extern inline mi_decl_allocator void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { +extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count,size,&total)) return NULL; return mi_heap_zalloc(heap,total); } -mi_decl_allocator void* mi_calloc(size_t count, size_t size) mi_attr_noexcept { +mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept { return mi_heap_calloc(mi_get_default_heap(),count,size); } // Uninitialized `calloc` -extern mi_decl_allocator void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { +extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; return mi_heap_malloc(heap, total); } -mi_decl_allocator void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept { +mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept { return mi_heap_mallocn(mi_get_default_heap(),count,size); } // Expand in place or fail -mi_decl_allocator void* mi_expand(void* p, size_t newsize) mi_attr_noexcept { +void* mi_expand(void* p, size_t newsize) mi_attr_noexcept { if (p == NULL) return NULL; size_t size = mi_usable_size(p); if (newsize > size) return NULL; @@ -571,11 +571,11 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) return newp; } -mi_decl_allocator void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { +void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { return _mi_heap_realloc_zero(heap, p, newsize, false); } -mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { +void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; return mi_heap_realloc(heap, p, total); @@ -583,41 +583,41 @@ mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, // Reallocate but free `p` on errors -mi_decl_allocator void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { +void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { void* newp = 
mi_heap_realloc(heap, p, newsize); if (newp==NULL && p!=NULL) mi_free(p); return newp; } -mi_decl_allocator void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { +void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { return _mi_heap_realloc_zero(heap, p, newsize, true); } -mi_decl_allocator void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { +void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; return mi_heap_rezalloc(heap, p, total); } -mi_decl_allocator void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept { +void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept { return mi_heap_realloc(mi_get_default_heap(),p,newsize); } -mi_decl_allocator void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept { +void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept { return mi_heap_reallocn(mi_get_default_heap(),p,count,size); } // Reallocate but free `p` on errors -mi_decl_allocator void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept { +void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept { return mi_heap_reallocf(mi_get_default_heap(),p,newsize); } -mi_decl_allocator void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept { +void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept { return mi_heap_rezalloc(mi_get_default_heap(), p, newsize); } -mi_decl_allocator void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept { +void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept { return mi_heap_recalloc(mi_get_default_heap(), p, count, size); } @@ -628,7 +628,7 @@ mi_decl_allocator void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_ // ------------------------------------------------------ // `strdup` using mi_malloc -char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept { +mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept { if (s == NULL) return NULL; size_t n = strlen(s); char* t = (char*)mi_heap_malloc(heap,n+1); @@ -636,12 +636,12 @@ char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept { return t; } -char* mi_strdup(const char* s) mi_attr_noexcept { +mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept { return mi_heap_strdup(mi_get_default_heap(), s); } // `strndup` using mi_malloc -char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept { +mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept { if (s == NULL) return NULL; size_t m = strlen(s); if (n > m) n = m; @@ -652,7 +652,7 @@ char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept return t; } -char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { +mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { return mi_heap_strndup(mi_get_default_heap(),s,n); } @@ -663,7 +663,7 @@ char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { #define PATH_MAX MAX_PATH #endif #include -char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { +mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { // todo: use GetFullPathNameW to allow longer file names char buf[PATH_MAX]; DWORD res = GetFullPathNameA(fname, PATH_MAX, 
(resolved_name == NULL ? buf : resolved_name), NULL); @@ -709,7 +709,7 @@ char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) } #endif -char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept { +mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept { return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name); } #endif @@ -774,19 +774,19 @@ static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow ) { return p; } -void* mi_new(size_t size) { +mi_decl_restrict void* mi_new(size_t size) { void* p = mi_malloc(size); if (mi_unlikely(p == NULL)) return mi_try_new(size,false); return p; } -void* mi_new_nothrow(size_t size) { +mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept { void* p = mi_malloc(size); if (mi_unlikely(p == NULL)) return mi_try_new(size, true); return p; } -void* mi_new_aligned(size_t size, size_t alignment) { +mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) { void* p; do { p = mi_malloc_aligned(size, alignment); @@ -795,7 +795,7 @@ void* mi_new_aligned(size_t size, size_t alignment) { return p; } -void* mi_new_aligned_nothrow(size_t size, size_t alignment) { +mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept { void* p; do { p = mi_malloc_aligned(size, alignment); @@ -804,7 +804,7 @@ void* mi_new_aligned_nothrow(size_t size, size_t alignment) { return p; } -void* mi_new_n(size_t count, size_t size) { +mi_decl_restrict void* mi_new_n(size_t count, size_t size) { size_t total; if (mi_unlikely(mi_count_size_overflow(count, size, &total))) { mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc diff --git a/test/main-override.cpp b/test/main-override.cpp index fcf3970f..490f1fb8 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -8,6 +8,25 @@ #include #include +#include +#include +#include + +// Issue #202 +void thread_main() { + mi_heap_t* heap = mi_heap_new(); + void* q = mi_heap_malloc(heap,1024); + // mi_heap_delete(heap); // uncomment to prevent assertion +} + +int main() { + auto t1 = std::thread(thread_main); + t1.join(); + return 0; +} + +/* + static void* p = malloc(8); void free_p() { @@ -32,13 +51,13 @@ int main() { free(p1); p1 = malloc(8); char* s = mi_strdup("hello\n"); - /* - char* s = _strdup("hello\n"); - char* buf = NULL; - size_t len; - _dupenv_s(&buf,&len,"MIMALLOC_VERBOSE"); - mi_free(buf); - */ + + //char* s = _strdup("hello\n"); + //char* buf = NULL; + //size_t len; + //_dupenv_s(&buf,&len,"MIMALLOC_VERBOSE"); + //mi_free(buf); + mi_free(p2); p2 = malloc(16); p1 = realloc(p1, 32); @@ -84,4 +103,5 @@ bool test_stl_allocator2() { vec.push_back(some_struct()); vec.pop_back(); return vec.size() == 0; -} \ No newline at end of file +} +*/ \ No newline at end of file From 946a71c4a957ac3a74c1270be44dcf8b32e254ae Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 13 Feb 2020 11:37:48 -0800 Subject: [PATCH 44/62] fix issue #204 (and #205) by doing thread delayed free after absorbing the pages --- src/heap.c | 30 +++++++++---------- test/main-override.cpp | 66 ++++++++++++++++++++++++++++++++---------- 2 files changed, 64 insertions(+), 32 deletions(-) diff --git a/src/heap.c b/src/heap.c index 93275747..900cef65 100644 --- a/src/heap.c +++ b/src/heap.c @@ -312,33 +312,29 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { mi_assert_internal(heap!=NULL); if (from==NULL || from->page_count == 0) return; - 
// unfull all full pages in the `from` heap - mi_page_t* page = from->pages[MI_BIN_FULL].first; - while (page != NULL) { - mi_page_t* next = page->next; - _mi_page_unfull(page); - page = next; - } - mi_assert_internal(from->pages[MI_BIN_FULL].first == NULL); - - // free outstanding thread delayed free blocks + // reduce the size of the delayed frees _mi_heap_delayed_free(from); - - // transfer all pages by appending the queues; this will set - // a new heap field which is ok as all pages are unfull'd and thus - // other threads won't access this field anymore (see `mi_free_block_mt`) - for (size_t i = 0; i < MI_BIN_FULL; i++) { + + // transfer all pages by appending the queues; this will set a new heap field + // so threads may do delayed frees in either heap for a while. + for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_queue_t* append = &from->pages[i]; size_t pcount = _mi_page_queue_append(heap, pq, append); heap->page_count += pcount; from->page_count -= pcount; } - mi_assert_internal(from->thread_delayed_free == NULL); mi_assert_internal(from->page_count == 0); + // and do outstanding delayed frees in the `from` heap + // note: be careful here as the `heap` field in all those pages no longer point to `from`, + // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a + // the regular `_mi_free_delayed_block` which is safe. + _mi_heap_delayed_free(from); + mi_assert_internal(from->thread_delayed_free == NULL); + // and reset the `from` heap - mi_heap_reset_pages(from); + mi_heap_reset_pages(from); } // Safe delete a heap without freeing any still allocated blocks in that heap. diff --git a/test/main-override.cpp b/test/main-override.cpp index 490f1fb8..957b7872 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -12,21 +12,27 @@ #include #include -// Issue #202 -void thread_main() { - mi_heap_t* heap = mi_heap_new(); - void* q = mi_heap_malloc(heap,1024); - // mi_heap_delete(heap); // uncomment to prevent assertion -} +#ifdef _WIN32 +#include +static void msleep(unsigned long msecs) { Sleep(msecs); } +#else +#include +static void msleep(unsigned long msecs) { usleep(msecs * 1000UL); } +#endif + +void heap_no_delete(); +void heap_late_free(); +void various_tests(); int main() { - auto t1 = std::thread(thread_main); - t1.join(); + mi_stats_reset(); // ignore earlier allocations + // heap_no_delete(); // issue #202 + // heap_late_free(); // issue #204 + various_tests(); + mi_stats_print(NULL); return 0; } -/* - static void* p = malloc(8); void free_p() { @@ -43,8 +49,7 @@ public: }; -int main() { - mi_stats_reset(); // ignore earlier allocations +void various_tests() { atexit(free_p); void* p1 = malloc(78); void* p2 = mi_malloc_aligned(16,24); @@ -68,8 +73,6 @@ int main() { delete t; t = new (std::nothrow) Test(42); delete t; - mi_stats_print(NULL); - return 0; } class Static { @@ -104,4 +107,37 @@ bool test_stl_allocator2() { vec.pop_back(); return vec.size() == 0; } -*/ \ No newline at end of file + + + +// Issue #202 +void heap_no_delete_worker() { + mi_heap_t* heap = mi_heap_new(); + void* q = mi_heap_malloc(heap,1024); + // mi_heap_delete(heap); // uncomment to prevent assertion +} + +void heap_no_delete() { + auto t1 = std::thread(heap_no_delete_worker); + t1.join(); +} + + +// Issue #204 +volatile void* global_p; + +void t1main() { + mi_heap_t* heap = mi_heap_new(); + global_p = mi_heap_malloc(heap, 1024); + mi_heap_delete(heap); +} + +void heap_late_free() { + auto t1 = std::thread(t1main); + + 
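// give t1main time to run: it allocates from its own heap, stores the block
+  // in `global_p`, and deletes that heap, so the `mi_free` below frees the
+  // block from another thread after the owning thread has terminated (issue #204).
+  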
msleep(2000); + assert(global_p); + mi_free((void*)global_p); + + t1.join(); +} \ No newline at end of file From 8a2a52843d36a361c3e9a42f37240cce5baab517 Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 13 Feb 2020 12:15:23 -0800 Subject: [PATCH 45/62] delete all thread owned heaps when a thread is terminated (issue #202) --- include/mimalloc-types.h | 2 ++ src/heap.c | 21 ++++++++++++++++++++- src/init.c | 23 +++++++++++++++++++---- test/main-override.cpp | 2 +- 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 71f3ae80..dc85bbcd 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -329,6 +329,7 @@ struct mi_heap_s { uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation size_t page_count; // total number of pages in the `pages` queues. + mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages }; @@ -469,6 +470,7 @@ struct mi_tld_s { unsigned long long heartbeat; // monotonic heartbeat count bool recurse; // true if deferred was called; used to prevent infinite recursion. mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) + mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) mi_segments_tld_t segments; // segment tld mi_os_tld_t os; // os tld mi_stats_t stats; // statistics diff --git a/src/heap.c b/src/heap.c index 900cef65..0bf26988 100644 --- a/src/heap.c +++ b/src/heap.c @@ -191,7 +191,7 @@ mi_heap_t* mi_heap_get_backing(void) { mi_heap_t* mi_heap_new(void) { mi_heap_t* bheap = mi_heap_get_backing(); - mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); + mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? 
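+  // note: the heap structure itself is allocated from the backing heap, which
+  // is why `mi_heap_free` can later release it again with a plain `mi_free`.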
if (heap==NULL) return NULL; memcpy(heap, &_mi_heap_empty, sizeof(mi_heap_t)); heap->tld = bheap->tld; @@ -201,6 +201,9 @@ mi_heap_t* mi_heap_new(void) { heap->keys[0] = _mi_heap_random_next(heap); heap->keys[1] = _mi_heap_random_next(heap); heap->no_reclaim = true; // don't reclaim abandoned pages or otherwise destroy is unsafe + // push on the thread local heaps list + heap->next = heap->tld->heaps; + heap->tld->heaps = heap; return heap; } @@ -230,6 +233,22 @@ static void mi_heap_free(mi_heap_t* heap) { if (mi_heap_is_default(heap)) { _mi_heap_set_default_direct(heap->tld->heap_backing); } + + // remove ourselves from the thread local heaps list + // linear search but we expect the number of heaps to be relatively small + mi_heap_t* prev = NULL; + mi_heap_t* curr = heap->tld->heaps; + while (curr != heap && curr != NULL) { + prev = curr; + curr = curr->next; + } + mi_assert_internal(curr == heap); + if (curr == heap) { + if (prev != NULL) { prev->next = heap->next; } + else { heap->tld->heaps = heap->next; } + } + mi_assert_internal(heap->tld->heaps != NULL); + // and free the used memory mi_free(heap); } diff --git a/src/init.c b/src/init.c index 2f5ca224..2c9dec1a 100644 --- a/src/init.c +++ b/src/init.c @@ -97,6 +97,7 @@ const mi_heap_t _mi_heap_empty = { { 0, 0 }, // keys { {0}, {0}, 0 }, 0, // page count + NULL, // next false }; @@ -111,7 +112,7 @@ extern mi_heap_t _mi_heap_main; static mi_tld_t tld_main = { 0, false, - &_mi_heap_main, + &_mi_heap_main, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os @@ -130,6 +131,7 @@ mi_heap_t _mi_heap_main = { { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) { {0x846ca68b}, {0}, 0 }, // random 0, // page count + NULL, // next heap false // can reclaim }; @@ -192,6 +194,7 @@ static bool _mi_heap_init(void) { heap->keys[1] = _mi_heap_random_next(heap); heap->tld = tld; tld->heap_backing = heap; + tld->heaps = heap; tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; tld->os.stats = &tld->stats; @@ -207,12 +210,24 @@ static bool _mi_heap_done(mi_heap_t* heap) { // reset default heap _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty); - // todo: delete all non-backing heaps? 
- - // switch to backing heap and free it + // switch to backing heap heap = heap->tld->heap_backing; if (!mi_heap_is_initialized(heap)) return false; + + // delete all non-backing heaps in this thread + mi_heap_t* curr = heap->tld->heaps; + while (curr != NULL) { + mi_heap_t* next = curr->next; // save `next` as `curr` will be freed + if (curr != heap) { + mi_assert_internal(!mi_heap_is_backing(curr)); + mi_heap_delete(curr); + } + curr = next; + } + mi_assert_internal(heap->tld->heaps == heap && heap->next == NULL); + mi_assert_internal(mi_heap_is_backing(heap)); + // collect if not the main thread if (heap != &_mi_heap_main) { _mi_heap_collect_abandon(heap); diff --git a/test/main-override.cpp b/test/main-override.cpp index 957b7872..b4ce4c1c 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -28,7 +28,7 @@ int main() { mi_stats_reset(); // ignore earlier allocations // heap_no_delete(); // issue #202 // heap_late_free(); // issue #204 - various_tests(); + // various_tests(); mi_stats_print(NULL); return 0; } From af37302e8327b610513a2762769bd23c006d8565 Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 13 Feb 2020 12:27:06 -0800 Subject: [PATCH 46/62] add assertions --- src/heap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/heap.c b/src/heap.c index 0bf26988..b1771264 100644 --- a/src/heap.c +++ b/src/heap.c @@ -226,6 +226,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) { // called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources. static void mi_heap_free(mi_heap_t* heap) { + mi_assert(heap != NULL); mi_assert_internal(mi_heap_is_initialized(heap)); if (mi_heap_is_backing(heap)) return; // dont free the backing heap @@ -305,6 +306,7 @@ void _mi_heap_destroy_pages(mi_heap_t* heap) { } void mi_heap_destroy(mi_heap_t* heap) { + mi_assert(heap != NULL); mi_assert(mi_heap_is_initialized(heap)); mi_assert(heap->no_reclaim); mi_assert_expensive(mi_heap_is_valid(heap)); @@ -359,6 +361,7 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { // Safe delete a heap without freeing any still allocated blocks in that heap. void mi_heap_delete(mi_heap_t* heap) { + mi_assert(heap != NULL); mi_assert(mi_heap_is_initialized(heap)); mi_assert_expensive(mi_heap_is_valid(heap)); if (!mi_heap_is_initialized(heap)) return; From e981e9227eb0237da1ff3e2909b96c671c5c115a Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 13 Feb 2020 13:12:19 -0800 Subject: [PATCH 47/62] ensure thread delayed freeing is correct during heap_absorb; #204 --- src/heap.c | 4 +++- src/page-queue.c | 5 +++++ test/main-override.cpp | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/heap.c b/src/heap.c index b1771264..5d0d4b8a 100644 --- a/src/heap.c +++ b/src/heap.c @@ -338,6 +338,8 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { // transfer all pages by appending the queues; this will set a new heap field // so threads may do delayed frees in either heap for a while. 
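+  // (the `heap` field of a page is what `mi_free_block_mt` in another thread
+  // reads to find the `thread_delayed_free` list to push a freed block on)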
+ // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state + // so after this only the new heap will get delayed frees for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_queue_t* append = &from->pages[i]; @@ -351,7 +353,7 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { // note: be careful here as the `heap` field in all those pages no longer point to `from`, // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a // the regular `_mi_free_delayed_block` which is safe. - _mi_heap_delayed_free(from); + _mi_heap_delayed_free(from); mi_assert_internal(from->thread_delayed_free == NULL); // and reset the `from` heap diff --git a/src/page-queue.c b/src/page-queue.c index 68e2aaa4..b2687c92 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -329,6 +329,7 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro mi_page_set_in_full(page, mi_page_queue_is_full(to)); } +// Only called from `mi_heap_absorb`. size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) { mi_assert_internal(mi_heap_contains_queue(heap,pq)); mi_assert_internal(pq->block_size == append->block_size); @@ -339,6 +340,10 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue size_t count = 0; for (mi_page_t* page = append->first; page != NULL; page = page->next) { mi_page_set_heap(page,heap); + // set it to delayed free (not overriding NEVER_DELAYED_FREE) which has as a + // side effect that it spins until any DELAYED_FREEING is finished. This ensures + // that after appending only the new heap will be used for delayed free operations. + _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false); count++; } diff --git a/test/main-override.cpp b/test/main-override.cpp index b4ce4c1c..957b7872 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -28,7 +28,7 @@ int main() { mi_stats_reset(); // ignore earlier allocations // heap_no_delete(); // issue #202 // heap_late_free(); // issue #204 - // various_tests(); + various_tests(); mi_stats_print(NULL); return 0; } From 67de2549cf8585250e17501e714c83a21746b20b Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 14 Feb 2020 09:40:56 -0800 Subject: [PATCH 48/62] fix build with clang-cl due to _Check_return_ (issue #200) --- include/mimalloc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index d1120e9f..f057c78d 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -26,9 +26,9 @@ terms of the MIT license. 
A copy of the license can be found in the file
 #if (__cplusplus >= 201703)
   #define mi_decl_nodiscard    [[nodiscard]]
-#elif (__GNUC__ >= 4)
+#elif (__GNUC__ >= 4) || defined(__clang__)  // includes clang, icc, and clang-cl
   #define mi_decl_nodiscard    __attribute__((warn_unused_result))
-#elif (_MSC_VER >= 1700)
+#elif (_MSC_VER >= 1700)
   #define mi_decl_nodiscard    _Check_return_
 #else
   #define mi_decl_nodiscard
 #endif

From 3e198cc87d7578f26b9dfe76731fc81a27687440 Mon Sep 17 00:00:00 2001
From: daan
Date: Fri, 14 Feb 2020 11:11:57 -0800
Subject: [PATCH 49/62] fix too strict assertion (issue #204)

---
 src/page-queue.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/page-queue.c b/src/page-queue.c
index b2687c92..ea213019 100644
--- a/src/page-queue.c
+++ b/src/page-queue.c
@@ -339,8 +339,10 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
   // set append pages to new heap and count
   size_t count = 0;
   for (mi_page_t* page = append->first; page != NULL; page = page->next) {
-    mi_page_set_heap(page,heap);
-    // set it to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
+    // inline `mi_page_set_heap` to avoid wrong assertion during absorption;
+    // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
+    mi_atomic_write(&page->xheap, (uintptr_t)heap);
+    // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
     // side effect that it spins until any DELAYED_FREEING is finished. This ensures
     // that after appending only the new heap will be used for delayed free operations.
     _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);

From 16ebb70e4c8a342e6453148397217e604fe45ee4 Mon Sep 17 00:00:00 2001
From: daan
Date: Mon, 17 Feb 2020 09:15:48 -0800
Subject: [PATCH 50/62] strengthen alignment guarantee (issue #206) (reverse
 commit 4531367)

---
 src/alloc-aligned.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c
index 85408868..8be2e598 100644
--- a/src/alloc-aligned.c
+++ b/src/alloc-aligned.c
@@ -20,8 +20,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
   mi_assert(alignment > 0 && alignment % sizeof(void*) == 0);
   if (mi_unlikely(size > PTRDIFF_MAX)) return NULL;   // we don't allocate more than PTRDIFF_MAX (see )
-  if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see )
-  if (alignment <= MI_MAX_ALIGN_SIZE && offset==0) return _mi_heap_malloc_zero(heap, size, zero);
+  if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see )
   const uintptr_t align_mask = alignment-1;  // for any x, `(x & align_mask) == (x % alignment)`

   // try if there is a small block available with just the right alignment

From f2ac272baaba126745a70b92bf0b8887fd3aedd6 Mon Sep 17 00:00:00 2001
From: daan
Date: Mon, 17 Feb 2020 09:59:11 -0800
Subject: [PATCH 51/62] strengthen alignment check for memalign and
 aligned_alloc

---
 include/mimalloc-internal-tld.h | 722 ++++++++++++++++++++++++++++++++
 include/mimalloc-internal.h     |   4 +
 src/alloc-aligned.c             |   2 +-
 src/alloc-posix.c               |  10 +-
 4 files changed, 732 insertions(+), 6 deletions(-)
 create mode 100644 include/mimalloc-internal-tld.h

diff --git a/include/mimalloc-internal-tld.h b/include/mimalloc-internal-tld.h
new file mode 100644
index 00000000..ce67b0c7
--- /dev/null
+++ b/include/mimalloc-internal-tld.h
@@ -0,0 +1,722 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_INTERNAL_TLD_H
+#define MIMALLOC_INTERNAL_TLD_H
+
+#include "mimalloc-types.h"
+#include "mimalloc-internal.h"
+
+#define MI_TLD_DECL          1   // thread local declaration
+#define MI_TLD_PTHREAD       2   // pthread_get/setspecific
+#define MI_TLD_DECL_GUARD    3   // thread local + recursion guard at initial load
+#define MI_TLD_PTHREAD_GUARD 4   // pthread_get/setspecific + recursion guard at initial load
+#define MI_TLD_SLOT          5   // steal slot from OS thread local predefined slots
+#define MI_TLD_PTHREAD_SLOT  6   // steal slot from pthread structure (usually `retval`)
+
+
+#if !defined(MI_TLD)
+#if defined(_MSC_VER) || defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__)
+  // on windows and linux/freeBSD/netBSD (with initial-exec) a __thread always works without recursion into malloc
+  #define MI_TLD  MI_TLD_DECL
+#elif !defined(MI_MALLOC_OVERRIDE)
+  // if not overriding, __thread declarations should be fine (use MI_TLD_PTHREAD if your OS does not have __thread)
+  #define MI_TLD  MI_TLD_DECL
+#else  // defined(MI_MALLOC_OVERRIDE)
+  // if overriding, some BSD variants allocate when accessing a thread local the first time
+  #if defined(__APPLE__)
+  #define MI_TLD  MI_TLD_SLOT
+  #define MI_TLD_SLOT_NUM 89  // seems unused? (__PTK_FRAMEWORK_OLDGC_KEY9) see
+  // possibly unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+  // #define MI_TLD  MI_TLD_PTHREAD_SLOT
+  // #define MI_TLD_PTHREAD_SLOT_OFS (2*sizeof(void*) + sizeof(long) + 2*sizeof(void*) /*TAILQ*/)  // offset `tl_exit_value`
+  #elif defined(__OpenBSD__)
+  #define MI_TLD  MI_TLD_PTHREAD_SLOT
+  #define MI_TLD_PTHREAD_SLOT_OFS (6*sizeof(int) + 1*sizeof(void*))  // offset `retval`
+  #elif defined(__DragonFly__)
+  #define MI_TLD  MI_TLD_PTHREAD_SLOT
+  #define MI_TLD_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?)
+  #endif
+ #endif
+#endif
+
+#if (MI_DEBUG>0)
+#define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
+#else
+#define mi_trace_message(...)
+#endif + +#define MI_CACHE_LINE 64 +#if defined(_MSC_VER) +#pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) +#define mi_decl_noinline __declspec(noinline) +#define mi_decl_thread __declspec(thread) +#define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) +#elif (defined(__GNUC__) && (__GNUC__>=3)) // includes clang and icc +#define mi_decl_noinline __attribute__((noinline)) +#define mi_decl_thread __thread +#define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) +#else +#define mi_decl_noinline +#define mi_decl_thread __thread // hope for the best :-) +#define mi_decl_cache_align +#endif + + +// "options.c" +void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); +void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); +void _mi_warning_message(const char* fmt, ...); +void _mi_verbose_message(const char* fmt, ...); +void _mi_trace_message(const char* fmt, ...); +void _mi_options_init(void); +void _mi_error_message(int err, const char* fmt, ...); + +// random.c +void _mi_random_init(mi_random_ctx_t* ctx); +void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); +uintptr_t _mi_random_next(mi_random_ctx_t* ctx); +uintptr_t _mi_heap_random_next(mi_heap_t* heap); +uintptr_t _os_random_weak(uintptr_t extra_seed); +static inline uintptr_t _mi_random_shuffle(uintptr_t x); + +// init.c +extern mi_stats_t _mi_stats_main; +extern const mi_page_t _mi_page_empty; +bool _mi_is_main_thread(void); +bool _mi_preloading(); // true while the C runtime is not ready + +// os.c +size_t _mi_os_page_size(void); +void _mi_os_init(void); // called from process init +void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocate thread local data +void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data +size_t _mi_os_good_alloc_size(size_t size); + +// memory.c +void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld); +void _mi_mem_free(void* p, size_t size, size_t id, bool fully_committed, bool any_reset, mi_os_tld_t* tld); + +bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld); +bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld); +bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld); +bool _mi_mem_protect(void* addr, size_t size); +bool _mi_mem_unprotect(void* addr, size_t size); + +void _mi_mem_collect(mi_os_tld_t* tld); + +// "segment.c" +mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_wsize, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); +void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); +void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size); // page start for any page +void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); + +void _mi_segment_thread_collect(mi_segments_tld_t* tld); +void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); +void _mi_abandoned_await_readers(void); + + + +// "page.c" +void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc; + +void _mi_page_retire(mi_page_t* page); // free the page if there are no other pages with many free blocks +void _mi_page_unfull(mi_page_t* page); +void _mi_page_free(mi_page_t* page, 
mi_page_queue_t* pq, bool force); // free the page +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... +void _mi_heap_delayed_free(mi_heap_t* heap); +void _mi_heap_collect_retired(mi_heap_t* heap, bool force); + +void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); +size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); +void _mi_deferred_free(mi_heap_t* heap, bool force); + +void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments + +size_t _mi_bin_size(uint8_t bin); // for stats +uint8_t _mi_bin(size_t size); // for stats +uint8_t _mi_bsr(uintptr_t x); // bit-scan-right, used on BSD in "os.c" + +// "heap.c" +void _mi_heap_destroy_pages(mi_heap_t* heap); +void _mi_heap_collect_abandon(mi_heap_t* heap); +void _mi_heap_set_default_direct(mi_heap_t* heap); + +// "stats.c" +void _mi_stats_done(mi_stats_t* stats); + +mi_msecs_t _mi_clock_now(void); +mi_msecs_t _mi_clock_end(mi_msecs_t start); +mi_msecs_t _mi_clock_start(void); + +// "alloc.c" +void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero); +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero); +mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); +bool _mi_free_delayed_block(mi_block_t* block); +void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size); + +#if MI_DEBUG>1 +bool _mi_page_is_valid(mi_page_t* page); +#endif + + +// ------------------------------------------------------ +// Branches +// ------------------------------------------------------ + +#if defined(__GNUC__) || defined(__clang__) +#define mi_unlikely(x) __builtin_expect((x),0) +#define mi_likely(x) __builtin_expect((x),1) +#else +#define mi_unlikely(x) (x) +#define mi_likely(x) (x) +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + + +/* ----------------------------------------------------------- + Error codes passed to `_mi_fatal_error` + All are recoverable but EFAULT is a serious error and aborts by default in secure mode. + For portability define undefined error codes using common Unix codes: + +----------------------------------------------------------- */ +#include +#ifndef EAGAIN // double free +#define EAGAIN (11) +#endif +#ifndef ENOMEM // out of memory +#define ENOMEM (12) +#endif +#ifndef EFAULT // corrupted free-list or meta-data +#define EFAULT (14) +#endif +#ifndef EINVAL // trying to free an invalid pointer +#define EINVAL (22) +#endif +#ifndef EOVERFLOW // count*size overflow +#define EOVERFLOW (75) +#endif + + +/* ----------------------------------------------------------- + Inlined definitions +----------------------------------------------------------- */ +#define UNUSED(x) (void)(x) +#if (MI_DEBUG>0) +#define UNUSED_RELEASE(x) +#else +#define UNUSED_RELEASE(x) UNUSED(x) +#endif + +#define MI_INIT4(x) x(),x(),x(),x() +#define MI_INIT8(x) MI_INIT4(x),MI_INIT4(x) +#define MI_INIT16(x) MI_INIT8(x),MI_INIT8(x) +#define MI_INIT32(x) MI_INIT16(x),MI_INIT16(x) +#define MI_INIT64(x) MI_INIT32(x),MI_INIT32(x) +#define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x) +#define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x) + + +// Is `x` a power of two? 
(0 is considered a power of two) +static inline bool _mi_is_power_of_two(uintptr_t x) { + return ((x & (x - 1)) == 0); +} + +// Align upwards +static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { + mi_assert_internal(alignment != 0); + uintptr_t mask = alignment - 1; + if ((alignment & mask) == 0) { // power of two? + return ((sz + mask) & ~mask); + } + else { + return (((sz + mask)/alignment)*alignment); + } +} + +// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. +static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { + mi_assert_internal(divider != 0); + return (divider == 0 ? size : ((size + divider - 1) / divider)); +} + +// Is memory zero initialized? +static inline bool mi_mem_is_zero(void* p, size_t size) { + for (size_t i = 0; i < size; i++) { + if (((uint8_t*)p)[i] != 0) return false; + } + return true; +} + +// Align a byte size to a size in _machine words_, +// i.e. byte size == `wsize*sizeof(void*)`. +static inline size_t _mi_wsize_from_size(size_t size) { + mi_assert_internal(size <= SIZE_MAX - sizeof(uintptr_t)); + return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); +} + + +// Overflow detecting multiply +static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { +#if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5 +#include // UINT_MAX, ULONG_MAX +#if (SIZE_MAX == UINT_MAX) + return __builtin_umul_overflow(count, size, total); +#elif (SIZE_MAX == ULONG_MAX) + return __builtin_umull_overflow(count, size, total); +#else + return __builtin_umulll_overflow(count, size, total); +#endif +#else /* __builtin_umul_overflow is unavailable */ + #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) + *total = count * size; + return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) + && size > 0 && (SIZE_MAX / size) < count); +#endif +} + +// Safe multiply `count*size` into `total`; return `true` on overflow. +static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* total) { + if (count==1) { // quick check for the case where count is one (common for C++ allocators) + *total = size; + return false; + } + else if (mi_unlikely(mi_mul_overflow(count, size, total))) { + _mi_error_message(EOVERFLOW, "allocation request too large (%zu * %zu bytes)\n", count, size); + *total = SIZE_MAX; + return true; + } + else return false; +} + + +/* ----------------------------------------------------------- + The thread local default heap +----------------------------------------------------------- */ + +extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap +extern mi_heap_t _mi_heap_main; // statically allocated main backing heap +extern bool _mi_process_is_initialized; + +#if defined(MI_TLS_OSX_FAST) +#define MI_TLS_OSX_OFFSET (MI_TLS_OSX_SLOT*sizeof(void*)) +static inline void* mi_tls_osx_fast_get(void) { + void* ret; + __asm__("mov %%gs:%1, %0" : "=r" (ret) : "m" (*(void**)(MI_TLS_OSX_OFFSET))); + return ret; +} +static inline void mi_tls_osx_fast_set(void* value) { + __asm__("movq %1,%%gs:%0" : "=m" (*(void**)(MI_TLS_OSX_OFFSET)) : "rn" (value)); +} +#elif defined(MI_TLS_PTHREADS) +extern pthread_key_t _mi_heap_default_key; +#else +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +#endif + + +static inline mi_heap_t* mi_get_default_heap(void) { +#if defined(MI_TLS_OSX_FAST) + // Use a fixed slot in the TSD on MacOSX to avoid recursion (since the loader calls malloc). 
+ // We use slot 94 (__PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4) + // which seems unused except for the more recent Webkit + // Use with care. + mi_heap_t* heap = (mi_heap_t*)mi_tls_osx_fast_get(); + return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); +#elif defined(MI_TLS_PTHREADS) + // Use pthreads for TLS; this is used on macOSX with interpose as the loader calls `malloc` + // to allocate TLS storage leading to recursive calls if __thread declared variables are accessed. + // Using pthreads allows us to initialize without recursive calls. (performance seems still quite good). + mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? (mi_heap_t*)&_mi_heap_empty : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); + return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); +#else + #if defined(MI_TLS_RECURSE_GUARD) + // On some BSD platforms, like openBSD, the dynamic loader calls `malloc` + // to initialize thread local data (before our module is loaded). + // To avoid recursion, we need to avoid accessing the thread local `_mi_default_heap` + // until our module is loaded and use the statically allocated main heap until that time. + // TODO: patch ourselves dynamically to avoid this check every time? + // if (mi_unlikely(!_mi_process_is_initialized)) return &_mi_heap_main; + #endif + return _mi_heap_default; +#endif +} + +static inline bool mi_heap_is_default(const mi_heap_t* heap) { + return (heap == mi_get_default_heap()); +} + +static inline bool mi_heap_is_backing(const mi_heap_t* heap) { + return (heap->tld->heap_backing == heap); +} + +static inline bool mi_heap_is_initialized(mi_heap_t* heap) { + mi_assert_internal(heap != NULL); + return (heap != &_mi_heap_empty); +} + +static inline uintptr_t _mi_ptr_cookie(const void* p) { + mi_assert_internal(_mi_heap_main.cookie != 0); + return ((uintptr_t)p ^ _mi_heap_main.cookie); +} + +/* ----------------------------------------------------------- + Pages +----------------------------------------------------------- */ + +static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) { + mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); + const size_t idx = _mi_wsize_from_size(size); + mi_assert_internal(idx < MI_PAGES_DIRECT); + return heap->pages_free_direct[idx]; +} + +// Get the page belonging to a certain size class +static inline mi_page_t* _mi_get_free_small_page(size_t size) { + return _mi_heap_get_free_small_page(mi_get_default_heap(), size); +} + +// Segment that contains the pointer +static inline mi_segment_t* _mi_ptr_segment(const void* p) { + // mi_assert_internal(p != NULL); + return (mi_segment_t*)((uintptr_t)p & ~MI_SEGMENT_MASK); +} + +// Segment belonging to a page +static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { + mi_segment_t* segment = _mi_ptr_segment(page); + mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]); + return segment; +} + +// used internally +static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) { + // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages + ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment; + mi_assert_internal(diff >= 0 && (size_t)diff < MI_SEGMENT_SIZE); + uintptr_t idx = (uintptr_t)diff >> segment->page_shift; + mi_assert_internal(idx < segment->capacity); + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0); + return idx; +} + +// Get the page containing the 
pointer +static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { + uintptr_t idx = _mi_segment_page_idx_of(segment, p); + return &((mi_segment_t*)segment)->pages[idx]; +} + +// Quick page start for initialized pages +static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { + const size_t bsize = page->xblock_size; + mi_assert_internal(bsize > 0 && (bsize%sizeof(void*)) == 0); + return _mi_segment_page_start(segment, page, bsize, page_size, NULL); +} + +// Get the page containing the pointer +static inline mi_page_t* _mi_ptr_page(void* p) { + return _mi_segment_page_of(_mi_ptr_segment(p), p); +} + +// Get the block size of a page (special cased for huge objects) +static inline size_t mi_page_block_size(const mi_page_t* page) { + const size_t bsize = page->xblock_size; + mi_assert_internal(bsize > 0); + if (mi_likely(bsize < MI_HUGE_BLOCK_SIZE)) { + return bsize; + } + else { + size_t psize; + _mi_segment_page_start(_mi_page_segment(page), page, bsize, &psize, NULL); + return psize; + } +} + +// Get the usable block size of a page without fixed padding. +// This may still include internal padding due to alignment and rounding up size classes. +static inline size_t mi_page_usable_block_size(const mi_page_t* page) { + return mi_page_block_size(page) - MI_PADDING_SIZE; +} + + +// Thread free access +static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { + return (mi_block_t*)(mi_atomic_read_relaxed(&page->xthread_free) & ~3); +} + +static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) { + return (mi_delayed_t)(mi_atomic_read_relaxed(&page->xthread_free) & 3); +} + +// Heap access +static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { + return (mi_heap_t*)(mi_atomic_read_relaxed(&page->xheap)); +} + +static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { + mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); + mi_atomic_write(&page->xheap,(uintptr_t)heap); +} + +// Thread free flag helpers +static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { + return (mi_block_t*)(tf & ~0x03); +} +static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) { + return (mi_delayed_t)(tf & 0x03); +} +static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) { + return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed); +} +static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { + return mi_tf_make(mi_tf_block(tf),delayed); +} +static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { + return mi_tf_make(block, mi_tf_delayed(tf)); +} + +// are all blocks in a page freed? +// note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`. +static inline bool mi_page_all_free(const mi_page_t* page) { + mi_assert_internal(page != NULL); + return (page->used == 0); +} + +// are there any available blocks? +static inline bool mi_page_has_any_available(const mi_page_t* page) { + mi_assert_internal(page != NULL && page->reserved > 0); + return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); +} + +// are there immediately available blocks, i.e. blocks available on the free list. +static inline bool mi_page_immediate_available(const mi_page_t* page) { + mi_assert_internal(page != NULL); + return (page->free != NULL); +} + +// is more than 7/8th of a page in use? 
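+// e.g. with `reserved = 128` blocks, `frac = 16`, so a page counts as mostly
+// used once `used >= 112`, i.e. when at most 1/8th of its blocks are still free.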
+static inline bool mi_page_mostly_used(const mi_page_t* page) { + if (page==NULL) return true; + uint16_t frac = page->reserved / 8U; + return (page->reserved - page->used <= frac); +} + +static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { + return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; +} + + + +//----------------------------------------------------------- +// Page flags +//----------------------------------------------------------- +static inline bool mi_page_is_in_full(const mi_page_t* page) { + return page->flags.x.in_full; +} + +static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { + page->flags.x.in_full = in_full; +} + +static inline bool mi_page_has_aligned(const mi_page_t* page) { + return page->flags.x.has_aligned; +} + +static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { + page->flags.x.has_aligned = has_aligned; +} + + +/* ------------------------------------------------------------------- +Encoding/Decoding the free list next pointers + +This is to protect against buffer overflow exploits where the +free list is mutated. Many hardened allocators xor the next pointer `p` +with a secret key `k1`, as `p^k1`. This prevents overwriting with known +values but might be still too weak: if the attacker can guess +the pointer `p` this can reveal `k1` (since `p^k1^p == k1`). +Moreover, if multiple blocks can be read as well, the attacker can +xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot +about the pointers (and subsequently `k1`). + +Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<> (MI_INTPTR_BITS - shift))); +} +static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) { + shift %= MI_INTPTR_BITS; + return ((x >> shift) | (x << (MI_INTPTR_BITS - shift))); +} + +static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { + void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]); + return (mi_unlikely(p==null) ? NULL : p); +} + +static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) { + uintptr_t x = (uintptr_t)(mi_unlikely(p==NULL) ? null : p); + return mi_rotl(x ^ keys[1], keys[0]) + keys[0]; +} + +static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) { + #ifdef MI_ENCODE_FREELIST + return (mi_block_t*)mi_ptr_decode(null, block->next, keys); + #else + UNUSED(keys); UNUSED(null); + return (mi_block_t*)block->next; + #endif +} + +static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) { + #ifdef MI_ENCODE_FREELIST + block->next = mi_ptr_encode(null, next, keys); + #else + UNUSED(keys); UNUSED(null); + block->next = (mi_encoded_t)next; + #endif +} + +static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) { + #ifdef MI_ENCODE_FREELIST + mi_block_t* next = mi_block_nextx(page,block,page->keys); + // check for free list corruption: is `next` at least in the same page? + // TODO: check if `next` is `page->block_size` aligned? 
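+  // note: this only catches a corrupted `next` that points outside the page;
+  // a forged value that still decodes to an address inside the same page passes.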
+ if (mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next))) { + _mi_error_message(EFAULT, "corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next); + next = NULL; + } + return next; + #else + UNUSED(page); + return mi_block_nextx(page,block,NULL); + #endif +} + +static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) { + #ifdef MI_ENCODE_FREELIST + mi_block_set_nextx(page,block,next, page->keys); + #else + UNUSED(page); + mi_block_set_nextx(page,block,next,NULL); + #endif +} + +// ------------------------------------------------------------------- +// Fast "random" shuffle +// ------------------------------------------------------------------- + +static inline uintptr_t _mi_random_shuffle(uintptr_t x) { + if (x==0) { x = 17; } // ensure we don't get stuck in generating zeros +#if (MI_INTPTR_SIZE==8) + // by Sebastiano Vigna, see: + x ^= x >> 30; + x *= 0xbf58476d1ce4e5b9UL; + x ^= x >> 27; + x *= 0x94d049bb133111ebUL; + x ^= x >> 31; +#elif (MI_INTPTR_SIZE==4) + // by Chris Wellons, see: + x ^= x >> 16; + x *= 0x7feb352dUL; + x ^= x >> 15; + x *= 0x846ca68bUL; + x ^= x >> 16; +#endif + return x; +} + +// ------------------------------------------------------------------- +// Optimize numa node access for the common case (= one node) +// ------------------------------------------------------------------- + +int _mi_os_numa_node_get(mi_os_tld_t* tld); +size_t _mi_os_numa_node_count_get(void); + +extern size_t _mi_numa_node_count; +static inline int _mi_os_numa_node(mi_os_tld_t* tld) { + if (mi_likely(_mi_numa_node_count == 1)) return 0; + else return _mi_os_numa_node_get(tld); +} +static inline size_t _mi_os_numa_node_count(void) { + if (mi_likely(_mi_numa_node_count>0)) return _mi_numa_node_count; + else return _mi_os_numa_node_count_get(); +} + + +// ------------------------------------------------------------------- +// Getting the thread id should be performant +// as it is called in the fast path of `_mi_free`, +// so we specialize for various platforms. 
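+// (the portable fallback at the end uses the address of the thread-local
+// default heap, which also identifies the thread uniquely)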
+// ------------------------------------------------------------------- +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include +static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { + // Windows: works on Intel and ARM in both 32- and 64-bit + return (uintptr_t)NtCurrentTeb(); +} +#elif (defined(__GNUC__) || defined(__clang__)) && \ + (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)) +// TLS register on x86 is in the FS or GS register +// see: https://akkadia.org/drepper/tls.pdf +static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { + uintptr_t tid; + #if defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r" (tid) : : ); // 32-bit always uses GS + #elif defined(__MACH__) + __asm__("movq %%gs:0, %0" : "=r" (tid) : : ); // x86_64 macOS uses GS + #elif defined(__x86_64__) + __asm__("movq %%fs:0, %0" : "=r" (tid) : : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + asm volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid)); + #elif defined(__aarch64__) + asm volatile ("mrs %0, tpidr_el0" : "=r" (tid)); + #endif + return tid; +} +#else +// otherwise use standard C +static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { + return (uintptr_t)&_mi_heap_default; +} +#endif + + +#endif diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index cea6b9c3..d0c0b3f3 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -232,6 +232,10 @@ static inline size_t _mi_wsize_from_size(size_t size) { return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); } +// Does malloc satisfy the alignment constraints already? +static inline bool mi_malloc_satisfies_alignment(size_t alignment, size_t size) { + return (alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2))); +} // Overflow detecting multiply static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 8be2e598..7eeb9e92 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -20,7 +20,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t mi_assert(alignment > 0 && alignment % sizeof(void*) == 0); if (mi_unlikely(size > PTRDIFF_MAX)) return NULL; // we don't allocate more than PTRDIFF_MAX (see ) - if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see ) + if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see ) const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` // try if there is a small block available with just the right alignment diff --git a/src/alloc-posix.c b/src/alloc-posix.c index c74b6082..4395893b 100644 --- a/src/alloc-posix.c +++ b/src/alloc-posix.c @@ -48,7 +48,7 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept if (p == NULL) return EINVAL; if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment if (!_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2 - void* q = (alignment <= MI_MAX_ALIGN_SIZE ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); + void* q = (mi_malloc_satisfies_alignment(alignment, size) ? 
mi_malloc(size) : mi_malloc_aligned(size, alignment)); if (q==NULL && size != 0) return ENOMEM; mi_assert_internal(((uintptr_t)q % alignment) == 0); *p = q; @@ -56,26 +56,26 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept } mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { - void* p = (alignment <= MI_MAX_ALIGN_SIZE ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); + void* p = (mi_malloc_satisfies_alignment(alignment,size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); mi_assert_internal(((uintptr_t)p % alignment) == 0); return p; } mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept { - return mi_malloc_aligned(size, _mi_os_page_size()); + return mi_memalign( _mi_os_page_size(), size ); } mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept { size_t psize = _mi_os_page_size(); if (size >= SIZE_MAX - psize) return NULL; // overflow - size_t asize = ((size + psize - 1) / psize) * psize; + size_t asize = _mi_align_up(size, psize); return mi_malloc_aligned(asize, psize); } mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { if (alignment==0 || !_mi_is_power_of_two(alignment)) return NULL; if ((size&(alignment-1)) != 0) return NULL; // C11 requires integral multiple, see - void* p = (alignment <= MI_MAX_ALIGN_SIZE ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); + void* p = (mi_malloc_satisfies_alignment(alignment, size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment)); mi_assert_internal(((uintptr_t)p % alignment) == 0); return p; } From baf08e8d546b52e3c6773582230aa3f0a6f539f2 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 17 Feb 2020 09:59:34 -0800 Subject: [PATCH 52/62] fix size check on overflow when padding is enabled in debug mode --- src/alloc.c | 4 ++-- src/page.c | 11 +++++++---- test/test-api.c | 42 ++++++++++++++++++++++++++++-------------- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/src/alloc.c b/src/alloc.c index b080e6fc..b1c4cd34 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -25,7 +25,7 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); mi_block_t* block = page->free; if (mi_unlikely(block == NULL)) { - return _mi_malloc_generic(heap, size); // slow path + return _mi_malloc_generic(heap, size); } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); // pop from the free list @@ -86,7 +86,7 @@ extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size else { mi_assert(heap!=NULL); mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local - void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE); + void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE); // note: size can overflow but it is detected in malloc_generic mi_assert_internal(p == NULL || mi_usable_size(p) >= size); #if MI_STAT>1 if (p != NULL) { diff --git a/src/page.c b/src/page.c index 6aaef428..ef8a69e5 100644 --- a/src/page.c +++ b/src/page.c @@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file /* ----------------------------------------------------------- The core of the allocator. Every segment contains - pages of a certain block size. The main function + pages of a {certain block size. The main function exported is `mi_malloc_generic`. 
----------------------------------------------------------- */ @@ -774,6 +774,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) { // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. +// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept { mi_assert_internal(heap != NULL); @@ -793,9 +794,10 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept // huge allocation? mi_page_t* page; - if (mi_unlikely(size > MI_LARGE_OBJ_SIZE_MAX)) { - if (mi_unlikely(size > PTRDIFF_MAX)) { // we don't allocate more than PTRDIFF_MAX (see ) - _mi_error_message(EOVERFLOW, "allocation request is too large (%zu b requested)\n", size); + const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` + if (mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) )) { + if (mi_unlikely(req_size > PTRDIFF_MAX)) { // we don't allocate more than PTRDIFF_MAX (see ) + _mi_error_message(EOVERFLOW, "allocation request is too large (%zu b requested)\n", req_size); return NULL; } else { @@ -804,6 +806,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept } else { // otherwise find a page with free blocks in our size segregated queues + mi_assert_internal(size >= MI_PADDING_SIZE); page = mi_find_free_page(heap,size); } if (mi_unlikely(page == NULL)) { // out of memory diff --git a/test/test-api.c b/test/test-api.c index 2d26e14d..166cfca6 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -31,7 +31,7 @@ we therefore test the API over various inputs. Please add more tests :-) #endif #include "mimalloc.h" -#include "mimalloc-internal.h" +// #include "mimalloc-internal.h" // --------------------------------------------------------------------------- // Test macros: CHECK(name,predicate) and CHECK_BODY(name,body) @@ -98,38 +98,34 @@ int main() { // --------------------------------------------------- // Extended - // --------------------------------------------------- - #if defined(MI_MALLOC_OVERRIDE) && !defined(_WIN32) + // --------------------------------------------------- CHECK_BODY("posix_memalign1", { void* p = &p; - int err = posix_memalign(&p, sizeof(void*), 32); - mi_assert((err==0 && (uintptr_t)p % sizeof(void*) == 0) || p==&p); + int err = mi_posix_memalign(&p, sizeof(void*), 32); + result = ((err==0 && (uintptr_t)p % sizeof(void*) == 0) || p==&p); mi_free(p); - result = (err==0); }); CHECK_BODY("posix_memalign_no_align", { void* p = &p; - int err = posix_memalign(&p, 3, 32); - mi_assert(p==&p); - result = (err==EINVAL); + int err = mi_posix_memalign(&p, 3, 32); + result = (err==EINVAL && p==&p); }); CHECK_BODY("posix_memalign_zero", { void* p = &p; - int err = posix_memalign(&p, sizeof(void*), 0); + int err = mi_posix_memalign(&p, sizeof(void*), 0); mi_free(p); result = (err==0); }); CHECK_BODY("posix_memalign_nopow2", { void* p = &p; - int err = posix_memalign(&p, 3*sizeof(void*), 32); + int err = mi_posix_memalign(&p, 3*sizeof(void*), 32); result = (err==EINVAL && p==&p); }); CHECK_BODY("posix_memalign_nomem", { void* p = &p; - int err = posix_memalign(&p, sizeof(void*), SIZE_MAX); + int err = mi_posix_memalign(&p, sizeof(void*), SIZE_MAX); result = (err==ENOMEM && p==&p); }); - #endif // --------------------------------------------------- // Aligned API @@ -147,12 +143,30 @@ int main() { mi_free(p1); result = (result1&&result2); }); + 
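// an extra illustrative check (not in the original suite): mi_aligned_alloc
+  // follows C11 `aligned_alloc`, so `size` must be an integral multiple of `alignment`.
+  CHECK_BODY("aligned_alloc1", {
+    void* p = mi_aligned_alloc(32, 64);
+    result = (p != NULL && (uintptr_t)p % 32 == 0); mi_free(p);
+  });
+  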
CHECK_BODY("malloc-aligned4", { + void* p; + bool ok = true; + for (int i = 0; i < 8 && ok; i++) { + p = mi_malloc_aligned(8, 16); + ok = (p != NULL && (uintptr_t)(p) % 16 == 0); mi_free(p); + } + result = ok; + }); CHECK_BODY("malloc-aligned-at1", { void* p = mi_malloc_aligned_at(48,32,0); result = (p != NULL && ((uintptr_t)(p) + 0) % 32 == 0); mi_free(p); }); CHECK_BODY("malloc-aligned-at2", { void* p = mi_malloc_aligned_at(50,32,8); result = (p != NULL && ((uintptr_t)(p) + 8) % 32 == 0); mi_free(p); - }); + }); + CHECK_BODY("memalign1", { + void* p; + bool ok = true; + for (int i = 0; i < 8 && ok; i++) { + p = mi_memalign(16,8); + ok = (p != NULL && (uintptr_t)(p) % 16 == 0); mi_free(p); + } + result = ok; + }); // --------------------------------------------------- // Heaps From 82684042be1be44d34caecc915fb51755278d843 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 17 Feb 2020 10:10:22 -0800 Subject: [PATCH 53/62] bump version to 1.6.1 --- include/mimalloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index f057c78d..85f25ffb 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 160 // major + 2 digits minor +#define MI_MALLOC_VERSION 161 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes From 6e1ca96a4965c776c10698c24dae576523178ef5 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 17 Feb 2020 10:19:29 -0800 Subject: [PATCH 54/62] Update readme.md --- readme.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/readme.md b/readme.md index e4e96ba7..423c91b9 100644 --- a/readme.md +++ b/readme.md @@ -11,7 +11,7 @@ mimalloc (pronounced "me-malloc") is a general purpose allocator with excellent [performance](#performance) characteristics. Initially developed by Daan Leijen for the run-time systems of the [Koka](https://github.com/koka-lang/koka) and [Lean](https://github.com/leanprover/lean) languages. -Latest release:`v1.6.0` (2020-02-09). +Latest release:`v1.6.1` (2020-02-17). It is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as: @@ -57,6 +57,7 @@ Enjoy! ### Releases +* 2020-02-17, `v1.6.1`: stable release 1.6: minor updates (build with clang-cl, fix alignment issue for small objects). * 2020-02-09, `v1.6.0`: stable release 1.6: fixed potential memory leak, improved overriding and thread local support on FreeBSD, NetBSD, DragonFly, and macOSX. New byte-precise heap block overflow detection in debug mode (besides the double-free detection and free-list @@ -275,8 +276,7 @@ resolved to the _mimalloc_ library. Note that certain security restrictions may apply when doing this from the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash). -Note: unfortunately, at this time, dynamic overriding on macOS seems broken but it is -actively worked on to fix this (see issue [`#50`](https://github.com/microsoft/mimalloc/issues/50)). +(Note: macOS support for dynamic overriding is recent, please report any issues.) 
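+For example, from the shell (a sketch; assuming `libmimalloc.dylib` is on the
+loader's search path):
+```
+> env DYLD_INSERT_LIBRARIES=libmimalloc.dylib myprogram
+```
+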
 
 ### Override on Windows

From ec61224db0cf6b851b1b116d4387bd8404242c95 Mon Sep 17 00:00:00 2001
From: daan
Date: Tue, 18 Feb 2020 20:05:30 -0800
Subject: [PATCH 55/62] fix padding issue with zero-sized allocation (issue #209)

---
 src/alloc.c            |  5 +++++
 test/main-override.cpp | 18 +++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/alloc.c b/src/alloc.c
index b1c4cd34..efa35f58 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -62,6 +62,11 @@ extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_
   mi_assert(heap!=NULL);
   mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
   mi_assert(size <= MI_SMALL_SIZE_MAX);
+  #if (MI_PADDING)
+  if (size == 0) {
+    size = sizeof(void*);
+  }
+  #endif
   mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE);
   void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE);
   mi_assert_internal(p==NULL || mi_usable_size(p) >= size);
diff --git a/test/main-override.cpp b/test/main-override.cpp
index 957b7872..18d49df3 100644
--- a/test/main-override.cpp
+++ b/test/main-override.cpp
@@ -22,12 +22,14 @@ static void msleep(unsigned long msecs) { usleep(msecs * 1000UL); }
 
 void heap_no_delete();
 void heap_late_free();
+void padding_shrink();
 void various_tests();
 
 int main() {
   mi_stats_reset();  // ignore earlier allocations
   // heap_no_delete();  // issue #202
   // heap_late_free();  // issue #204
+  padding_shrink();     // issue #209
   various_tests();
   mi_stats_print(NULL);
   return 0;
 }
@@ -140,4 +142,18 @@ void heap_late_free() {
   mi_free((void*)global_p);
   t1.join();
-}
\ No newline at end of file
+}
+
+// issue #209
+static void* shared_p;
+static void alloc0(/* void* arg */)
+{
+  shared_p = mi_malloc(8);
+}
+
+void padding_shrink(void)
+{
+  auto t1 = std::thread(alloc0);
+  t1.join();
+  mi_free(shared_p);
+}

From b41183e8a36bbe815ff5a8897d27aa2581c44601 Mon Sep 17 00:00:00 2001
From: Orkhan Hasanli
Date: Wed, 4 Mar 2020 20:45:20 -0500
Subject: [PATCH 56/62] Fix failing creation of symbolic link

#166
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e16830aa..c268b7a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -216,7 +216,7 @@ if(NOT WIN32)
   # install a symlink in the /usr/local/lib to the versioned library
   set(mi_symlink "${CMAKE_SHARED_MODULE_PREFIX}${mi_basename}${CMAKE_SHARED_LIBRARY_SUFFIX}")
   set(mi_soname "mimalloc-${mi_version}/${mi_symlink}.${mi_version}")
-  install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${mi_soname} ${mi_symlink} WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/${mi_install_dir}/..)")
+  install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${mi_soname} ${mi_symlink} WORKING_DIRECTORY ${mi_install_dir}/..)")
   install(CODE "MESSAGE(\"-- Symbolic link: ${CMAKE_INSTALL_PREFIX}/lib/${mi_symlink} -> ${mi_soname}\")")
 endif()

From ab202fbe7399d5851c6913fc11626b234020b551 Mon Sep 17 00:00:00 2001
From: daan
Date: Fri, 6 Mar 2020 15:53:07 -0800
Subject: [PATCH 57/62] never free the main heap structures; issue #207

---
 src/init.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/init.c b/src/init.c
index 2c9dec1a..0fd5454c 100644
--- a/src/init.c
+++ b/src/init.c
@@ -241,7 +241,9 @@ static bool _mi_heap_done(mi_heap_t* heap) {
     mi_assert_internal(heap->tld->segments.count == 0);
     _mi_os_free(heap, sizeof(mi_thread_data_t), &_mi_stats_main);
   }
-#if (MI_DEBUG > 0)
+#if 0
+  // never free the main thread heap even in debug mode; if a dll is linked statically with mimalloc,
+  // there may still be delete/free calls after mi_fls_done is called. Issue #207
   else {
     _mi_heap_destroy_pages(heap);
     mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main);

From 5bc276c23b1d54f4aad92eae7fe7f572028a56ba Mon Sep 17 00:00:00 2001
From: daan
Date: Fri, 6 Mar 2020 16:24:51 -0800
Subject: [PATCH 58/62] fix issue #208: dynamic unloading of a DLL with statically linked mimalloc

---
 src/init.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/init.c b/src/init.c
index 0fd5454c..cfb44ecf 100644
--- a/src/init.c
+++ b/src/init.c
@@ -485,6 +485,10 @@ static void mi_process_done(void) {
   if (process_done) return;
   process_done = true;
 
+  #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+  FlsSetValue(mi_fls_key, NULL);  // don't call main-thread callback
+  FlsFree(mi_fls_key);            // call thread-done on all threads to prevent dangling callback pointer if statically linked with a DLL; Issue #208
+  #endif
   #ifndef NDEBUG
   mi_collect(true);
   #endif
@@ -492,7 +496,7 @@
       mi_option_is_enabled(mi_option_verbose)) {
     mi_stats_print(NULL);
   }
-  mi_allocator_done();  
+  mi_allocator_done();
   _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
   os_preloading = true; // don't call the C runtime anymore
 }

From 72f758c433d9a8ece82404a711b474aae8c7cf7e Mon Sep 17 00:00:00 2001
From: daan
Date: Fri, 6 Mar 2020 16:43:39 -0800
Subject: [PATCH 59/62] fix issue #210 where multiple static instances of mimalloc in DLLs compete for the same virtual memory area

---
 src/os.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/os.c b/src/os.c
index 0aa85bd6..89fd349b 100644
--- a/src/os.c
+++ b/src/os.c
@@ -209,7 +209,12 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment
   // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations
   void* hint;
   if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) {
-    return VirtualAlloc(hint, size, flags, PAGE_READWRITE);
+    void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
+    if (p != NULL) return p;
+    DWORD err = GetLastError();
+    if (err != ERROR_INVALID_ADDRESS) {  // if linked with multiple instances, we may have tried to allocate at an already allocated area
+      return NULL;
+    }
   }
   #endif
   #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)

From 854e81c11dfc865584299515b70d0b91a5e4191b Mon Sep 17 00:00:00 2001
From: David Carlier
Date: Sat, 7 Mar 2020 09:18:45 +0000
Subject: [PATCH 60/62] arm build fix: add native arch flag so the yield asm instruction can be generated

---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c268b7a9..fffac46a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,6 +131,11 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel")
   endif()
 endif()
 
+# Architecture flags
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
+  list(APPEND mi_cflags -march=native)
+endif()
+
 # extra needed libraries
 if(WIN32)
   list(APPEND mi_libraries psapi shell32 user32 bcrypt)

From 2884affbd70da897473603b4009aca2d63bc0a7b Mon Sep 17 00:00:00 2001
From: David Carlier
Date: Sat, 7 Mar 2020 12:19:48 +0000
Subject: [PATCH 61/62] Use the host system processor rather than the target

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fffac46a..a0893007 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -132,7 +132,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel")
 endif()
 
 # Architecture flags
-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
+if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "arm")
   list(APPEND mi_cflags -march=native)
 endif()

From 7745dde8d257010da34b4eb8d7d6246b99631ac9 Mon Sep 17 00:00:00 2001
From: daan
Date: Mon, 16 Mar 2020 15:31:37 -0700
Subject: [PATCH 62/62] allow retirement for all object sizes (issue #212)

---
 include/mimalloc-types.h |  2 ++
 src/init.c               |  2 ++
 src/page.c               | 27 ++++++++++++++++++++++-----
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index dc85bbcd..28606668 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -329,6 +329,8 @@ struct mi_heap_s {
   uintptr_t keys[2];        // two random keys used to encode the `thread_delayed_free` list
   mi_random_ctx_t random;   // random number context used for secure allocation
   size_t page_count;        // total number of pages in the `pages` queues.
+  size_t page_retired_min;  // smallest retired index (retired pages are fully free, but still in the page queues)
+  size_t page_retired_max;  // largest retired index into the `pages` array.
   mi_heap_t* next;          // list of heaps per thread
   bool no_reclaim;          // `true` if this heap should not reclaim abandoned pages
 };
diff --git a/src/init.c b/src/init.c
index cfb44ecf..2e94935c 100644
--- a/src/init.c
+++ b/src/init.c
@@ -97,6 +97,7 @@ const mi_heap_t _mi_heap_empty = {
   { 0, 0 },        // keys
   { {0}, {0}, 0 },
   0,               // page count
+  MI_BIN_FULL, 0,  // page retired min/max
   NULL,            // next
   false
 };
@@ -131,6 +132,7 @@ mi_heap_t _mi_heap_main = {
   { 0, 0 },        // the key of the main heap can be fixed (unlike page keys that need to be secure!)
   { {0x846ca68b}, {0}, 0 },  // random
   0,               // page count
+  MI_BIN_FULL, 0,  // page retired min/max
   NULL,            // next heap
   false            // can reclaim
 };
diff --git a/src/page.c b/src/page.c
index ef8a69e5..2903b258 100644
--- a/src/page.c
+++ b/src/page.c
@@ -380,7 +380,8 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
   _mi_segment_page_free(page, force, segments_tld);
 }
 
-#define MI_MAX_RETIRE_SIZE    (4*MI_SMALL_SIZE_MAX)
+#define MI_MAX_RETIRE_SIZE    MI_LARGE_OBJ_SIZE_MAX
+#define MI_RETIRE_CYCLES      (16)
 
 // Retire a page with no more used blocks
 // Important to not retire too quickly though as new
@@ -405,7 +406,13 @@ void _mi_page_retire(mi_page_t* page) {
   if (mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_is_in_full(page))) {
     if (pq->last==page && pq->first==page) { // the only page in the queue?
       mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
-      page->retire_expire = 16;
+      page->retire_expire = MI_RETIRE_CYCLES;
+      mi_heap_t* heap = mi_page_heap(page);
+      mi_assert_internal(pq >= heap->pages);
+      const size_t index = pq - heap->pages;
+      mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
+      if (index < heap->page_retired_min) heap->page_retired_min = index;
+      if (index > heap->page_retired_max) heap->page_retired_max = index;
       mi_assert_internal(mi_page_all_free(page));
       return; // don't free after all
     }
@@ -415,22 +422,32 @@
 
 // free retired pages: we don't need to look at the entire queues
-// since we only retire pages that are the last one in a queue.
+// since we only retire pages that are at the head position in a queue.
 void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
-  for(mi_page_queue_t* pq = heap->pages; pq->block_size <= MI_MAX_RETIRE_SIZE; pq++) {
-    mi_page_t* page = pq->first;
+  size_t min = MI_BIN_FULL;
+  size_t max = 0;
+  for(size_t bin = heap->page_retired_min; bin <= heap->page_retired_max; bin++) {
+    mi_page_queue_t* pq = &heap->pages[bin];
+    mi_page_t* page = pq->first;
     if (page != NULL && page->retire_expire != 0) {
       if (mi_page_all_free(page)) {
         page->retire_expire--;
         if (force || page->retire_expire == 0) {
           _mi_page_free(pq->first, pq, force);
         }
+        else {
+          // keep retired, update min/max
+          if (bin < min) min = bin;
+          if (bin > max) max = bin;
+        }
       }
       else {
        page->retire_expire = 0;
      }
    }
  }
+ heap->page_retired_min = min;
+ heap->page_retired_max = max;
 }
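
The final patch replaces a linear scan over every small-object queue with a [page_retired_min, page_retired_max] window that brackets the bins which may still hold retired pages. Below is a minimal, self-contained sketch of that bookkeeping — illustrative code, not the mimalloc source: heap_t, page_t, queue_t, retire, collect_retired, BIN_FULL, and RETIRE_CYCLES are simplified stand-ins for the real types, functions, and constants in the diff above.

#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>

#define BIN_FULL       74   /* stand-in for MI_BIN_FULL: one past the last real bin */
#define RETIRE_CYCLES  16   /* stand-in for MI_RETIRE_CYCLES */

typedef struct page_s  { int retire_expire; bool all_free; } page_t;
typedef struct queue_s { page_t* first; } queue_t;
typedef struct heap_s {
  queue_t pages[BIN_FULL];
  size_t  retired_min;   /* smallest bin that may hold a retired page (BIN_FULL = none) */
  size_t  retired_max;   /* largest such bin (0 = none) */
} heap_t;

/* Retire: keep the page around for RETIRE_CYCLES collect passes and
   widen the [retired_min, retired_max] window to cover its bin. */
static void retire(heap_t* heap, size_t bin) {
  page_t* page = heap->pages[bin].first;
  page->retire_expire = RETIRE_CYCLES;
  if (bin < heap->retired_min) heap->retired_min = bin;
  if (bin > heap->retired_max) heap->retired_max = bin;
}

/* Collect: only bins inside the window can hold retired pages, so the scan
   is bounded by the window instead of by the number of size classes; the
   window is then re-tightened around the pages that are still pending. */
static void collect_retired(heap_t* heap, bool force) {
  size_t min = BIN_FULL;
  size_t max = 0;
  for (size_t bin = heap->retired_min; bin <= heap->retired_max && bin < BIN_FULL; bin++) {
    page_t* page = heap->pages[bin].first;
    if (page == NULL || page->retire_expire == 0) continue;
    if (page->all_free) {
      page->retire_expire--;
      if (force || page->retire_expire == 0) {
        page->retire_expire = 0;   /* really free the page here */
      }
      else {                       /* still pending: keep its bin in the window */
        if (bin < min) min = bin;
        if (bin > max) max = bin;
      }
    }
    else {
      page->retire_expire = 0;     /* page got used again: cancel the retirement */
    }
  }
  heap->retired_min = min;
  heap->retired_max = max;
}

int main(void) {
  static heap_t heap;
  static page_t page = { 0, true };
  heap.retired_min = BIN_FULL;     /* empty window, matching the _mi_heap_empty initializer */
  heap.pages[8].first = &page;
  retire(&heap, 8);
  for (int i = 0; i < RETIRE_CYCLES; i++) collect_retired(&heap, false);
  printf("window after expiry: [%zu, %zu]\n", heap.retired_min, heap.retired_max);  /* [74, 0] = empty */
  return 0;
}

The invariants mirror the patch: _mi_page_retire only ever widens the window, _mi_heap_collect_retired rebuilds it from the pages still pending, and an empty window (MI_BIN_FULL, 0) makes the collect loop a no-op — which is why both heap initializers in init.c gain the `MI_BIN_FULL, 0, // page retired min/max` entry.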