From cf89fc6338945fb9e1be4532cc9e98c58b77f121 Mon Sep 17 00:00:00 2001 From: Biswapriyo Nath Date: Fri, 18 Feb 2022 12:06:08 +0530 Subject: [PATCH 01/17] Fix strict function prototype warnings Fix warning: function declaration isn't a prototype [-Wstrict-prototypes] In C int foo() and int foo(void) are different functions. --- src/os.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/os.c b/src/os.c index 757e8cab..e458946c 100644 --- a/src/os.c +++ b/src/os.c @@ -107,7 +107,7 @@ bool _mi_os_has_overcommit(void) { } // OS (small) page size -size_t _mi_os_page_size() { +size_t _mi_os_page_size(void) { return os_page_size; } @@ -159,7 +159,7 @@ static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL; static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL; static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL; -static bool mi_win_enable_large_os_pages() +static bool mi_win_enable_large_os_pages(void) { if (large_os_page_size > 0) return true; @@ -230,7 +230,7 @@ void _mi_os_init(void) } } #elif defined(__wasi__) -void _mi_os_init() { +void _mi_os_init(void) { os_overcommit = false; os_page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB os_alloc_granularity = 16; @@ -261,7 +261,7 @@ static void os_detect_overcommit(void) { #endif } -void _mi_os_init() { +void _mi_os_init(void) { // get the page size long result = sysconf(_SC_PAGESIZE); if (result > 0) { @@ -1302,7 +1302,7 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) { Support NUMA aware allocation -----------------------------------------------------------------------------*/ #ifdef _WIN32 -static size_t mi_os_numa_nodex() { +static size_t mi_os_numa_nodex(void) { USHORT numa_node = 0; if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) { // Extended API is supported From 1f8138a4dc08c906e34b047f4e462d1c292a35ee Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Mon, 28 Feb 2022 09:25:56 +0200 Subject: [PATCH 02/17] Fix wrong reporting of area used bytes Fixes #552 --- src/heap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/heap.c b/src/heap.c index d560fbc6..7cb4e5d1 100644 --- a/src/heap.c +++ b/src/heap.c @@ -530,7 +530,7 @@ static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa xarea.area.reserved = page->reserved * bsize; xarea.area.committed = page->capacity * bsize; xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL); - xarea.area.used = page->used; + xarea.area.used = page->used * bsize; xarea.area.block_size = bsize; return fun(heap, &xarea, arg); } From c027c27c0aad8d0389e1b0f1fbc01c789b3835ae Mon Sep 17 00:00:00 2001 From: David Carlier Date: Wed, 30 Mar 2022 18:45:22 +0100 Subject: [PATCH 03/17] update the docs to clarify the proper C++ override usage. --- docs/overrides.html | 2 +- docs/using.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/overrides.html b/docs/overrides.html index 0e7fd0ec..fc0ad591 100644 --- a/docs/overrides.html +++ b/docs/overrides.html @@ -118,7 +118,7 @@ $(document).ready(function(){initNavTree('overrides.html',''); initResizable();

Windows

Overriding on Windows is robust and has the particular advantage to be able to redirect all malloc/free calls that go through the (dynamic) C runtime allocator, including those from other DLL's or libraries.

The overriding on Windows requires that you link your program explicitly with the mimalloc DLL and use the C-runtime library as a DLL (using the /MD or /MDd switch). Also, the mimalloc-redirect.dll (or mimalloc-redirect32.dll) must be available in the same folder as the main mimalloc-override.dll at runtime (as it is a dependency). The redirection DLL ensures that all calls to the C runtime malloc API get redirected to mimalloc (in mimalloc-override.dll).

-

To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some call to the mimalloc API in the main function, like mi_version() (or use the /INCLUDE:mi_version switch on the linker). See the mimalloc-override-test project for an example on how to use this. For best performance on Windows with C++, it is also recommended to also override the new/delete operations (by including mimalloc-new-delete.h a single(!) source file in your project).

+

To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some call to the mimalloc API in the main function, like mi_version() (or use the /INCLUDE:mi_version switch on the linker). See the mimalloc-override-test project for an example of how to use this. For best performance on Windows with C++, it is recommended to also override the new/delete operations (by including mimalloc-new-delete.h in a single(!) source file in your project without linking to the mimalloc library).
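As a minimal sketch (not part of the original docs; it assumes a console program compiled with /MD and linked against the mimalloc DLL as described above, with hypothetical file names), this could look like:

```cpp
// main.cpp -- hypothetical example: reference the mimalloc API so that
// mimalloc-override.dll (and thus mimalloc-redirect.dll) is loaded at start-up.
#include <cstdio>
#include <cstdlib>
#include <mimalloc.h>

int main() {
  std::printf("mimalloc version: %d\n", mi_version());  // forces the DLL dependency
  void* p = std::malloc(128);  // redirected to mimalloc by the redirection DLL
  std::free(p);
  return 0;
}
```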

The environment variable MIMALLOC_DISABLE_REDIRECT=1 can be used to disable dynamic overriding at run-time. Use MIMALLOC_VERBOSE=1 to check if mimalloc was successfully redirected.
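Redirection status can also be checked programmatically; a small sketch (assuming the mi_is_redirected() API declared in mimalloc.h, which reports whether the redirection is active) might be:

```cpp
#include <cstdio>
#include <mimalloc.h>

int main() {
  if (mi_is_redirected()) {
    std::printf("malloc/free are redirected to mimalloc\n");
  }
  else {
    std::printf("using the standard C runtime allocator (redirection disabled or failed)\n");
  }
  return 0;
}
```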

(Note: in principle, it is possible to even patch existing executables without any recompilation if they are linked with the dynamic C runtime (ucrtbase.dll) – just put the mimalloc-override.dll into the import table (and put mimalloc-redirect.dll in the same folder); such patching can be done for example with CFF Explorer).

Static override

diff --git a/docs/using.html b/docs/using.html index e6aad1a2..140f0c5c 100644 --- a/docs/using.html +++ b/docs/using.html @@ -105,7 +105,7 @@ $(document).ready(function(){initNavTree('using.html',''); initResizable(); });

to link with the shared (dynamic) library, or:

target_link_libraries(myapp PUBLIC mimalloc-static)

to link with the static library. See test\CMakeLists.txt for an example.

C++

-

For best performance in C++ programs, it is also recommended to override the global new and delete operators. For convience, mimalloc provides mimalloc-new-delete.h which does this for you – just include it in a single(!) source file in your project.

+

For best performance in C++ programs, it is also recommended to override the global new and delete operators. For convenience, mimalloc provides mimalloc-new-delete.h which does this for you – just include it in a single(!) source file in your project without linking to the mimalloc library.
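A minimal sketch of this setup (not from the original docs; the file name is hypothetical and the header is included in exactly one translation unit) could be:

```cpp
// new-delete-override.cpp -- include the header in exactly one source file of the project
#include <mimalloc-new-delete.h>  // overrides the global operator new/delete with mimalloc

#include <string>

int main() {
  // ordinary new/delete now allocate through mimalloc
  std::string* s = new std::string("allocated via mimalloc");
  delete s;
  return 0;
}
```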

In C++, mimalloc also provides the mi_stl_allocator struct which implements the std::allocator interface. For example:

std::vector<some_struct, mi_stl_allocator<some_struct>> vec;
vec.push_back(some_struct());

Statistics

From 3c7ce7d3c6cd6a71db65cdd06881f45ab0848078 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Thu, 7 Apr 2022 19:09:31 -0700 Subject: [PATCH 04/17] improve mi_realloc codepath --- include/mimalloc-internal.h | 6 ++-- src/alloc.c | 59 ++++++++++++++++++++++--------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 16be1251..79adc231 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -138,8 +138,8 @@ mi_msecs_t _mi_clock_start(void); // "alloc.c" void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_malloc_generic` -void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero); -void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero); +void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size); @@ -945,7 +945,7 @@ static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0)); void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE); - memcpy(adst, asrc, n); + _mi_memcpy(adst, asrc, n); } #else // Default fallback on `_mi_memcpy` diff --git a/src/alloc.c b/src/alloc.c index 5f150f24..62e76e23 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -147,7 +147,7 @@ mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept { return p; } -void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) { +void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { void* p = mi_heap_malloc(heap,size); if (zero && p != NULL) { _mi_block_zero_init(_mi_ptr_page(p),p,size); // todo: can we avoid getting the page again? 
@@ -530,20 +530,25 @@ bool _mi_free_delayed_block(mi_block_t* block) { } // Bytes available in a block -static size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - const mi_segment_t* const segment = mi_checked_ptr_segment(p,msg); - if (segment==NULL) return 0; - const mi_page_t* const page = _mi_segment_page_of(segment, p); - const mi_block_t* block = (const mi_block_t*)p; - if (mi_unlikely(mi_page_has_aligned(page))) { - block = _mi_page_ptr_unalign(segment, page, p); - size_t size = mi_page_usable_size_of(page, block); - ptrdiff_t const adjust = (uint8_t*)p - (uint8_t*)block; - mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); - return (size - adjust); +mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { + const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); + const size_t size = mi_page_usable_size_of(page, block); + const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; + mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); + return (size - adjust); +} + +static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { + const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); + if (segment==NULL) return 0; // also returns 0 if `p == NULL` + const mi_page_t* const page = _mi_segment_page_of(segment, p); + if (mi_likely(!mi_page_has_aligned(page))) { + const mi_block_t* block = (const mi_block_t*)p; + return mi_page_usable_size_of(page, block); } else { - return mi_page_usable_size_of(page, block); + // split out to separate routine for improved code generation + return mi_page_usable_aligned_size_of(segment, page, p); } } @@ -612,35 +617,43 @@ mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept { return mi_heap_mallocn(mi_get_default_heap(),count,size); } -// Expand in place or fail +// Expand (or shrink) in place (or fail) void* mi_expand(void* p, size_t newsize) mi_attr_noexcept { + #if MI_PADDING + // we do not shrink/expand with padding enabled + MI_UNUSED(p); MI_UNUSED(newsize); + return NULL; + #else if (p == NULL) return NULL; - size_t size = _mi_usable_size(p,"mi_expand"); + const size_t size = _mi_usable_size(p,"mi_expand"); if (newsize > size) return NULL; return p; // it fits + #endif } -void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) { - if (p == NULL) return _mi_heap_malloc_zero(heap,newsize,zero); - size_t size = _mi_usable_size(p,"mi_realloc"); - if (newsize <= size && newsize >= (size / 2)) { +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept { + const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL + if (mi_unlikely(newsize <= size && newsize >= (size / 2))) { + // todo: adjust potential padding to reflect the new size? return p; // reallocation still fits and not more than 50% waste } void* newp = mi_heap_malloc(heap,newsize); if (mi_likely(newp != NULL)) { if (zero && newsize > size) { // also set last word in the previous allocation to zero to ensure any padding is zero-initialized - size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0); + const size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0); memset((uint8_t*)newp + start, 0, newsize - start); } - _mi_memcpy_aligned(newp, p, (newsize > size ? 
size : newsize)); - mi_free(p); // only free if successful + if (mi_likely(p != NULL)) { + _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize)); + mi_free(p); // only free the original pointer if successful + } } return newp; } void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { - return _mi_heap_realloc_zero(heap, p, newsize, false); + return _mi_heap_realloc_zero(heap, p, newsize, false); } void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { From 6e5788d076f78b25d19c1afbc3b9a3434c38dcfa Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Apr 2022 20:17:48 -0700 Subject: [PATCH 05/17] add small cache for thread metadata for programs that create/destroy many OS threads --- src/init.c | 86 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 15 deletions(-) diff --git a/src/init.c b/src/init.c index 854a2228..72c56b42 100644 --- a/src/init.c +++ b/src/init.c @@ -165,6 +165,68 @@ typedef struct mi_thread_data_s { mi_tld_t tld; } mi_thread_data_t; + +// Thread meta-data is allocated directly from the OS. For +// some programs that do not use thread pools and allocate and +// destroy many OS threads, this may causes too much overhead +// per thread so we maintain a small cache of recently freed metadata. + +#define TD_CACHE_SIZE (8) +static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE]; + +static mi_thread_data_t* mi_thread_data_alloc(void) { + // try to find thread metadata in the cache + mi_thread_data_t* td; + for (int i = 0; i < TD_CACHE_SIZE; i++) { + td = mi_atomic_load_ptr_relaxed(mi_thread_data_t*, &td_cache[i]); + if (td != NULL) { + mi_thread_data_t* expected = td; + if (mi_atomic_cas_weak_acq_rel(&td_cache[i], &expected, NULL)) { + return td; + } + } + } + // if that fails, allocate directly from the OS + td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main); + if (td == NULL) { + // if this fails, try once more. 
(issue #257) + td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main); + if (td == NULL) { + // really out of memory + _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); + } + } + return td; +} + +static void mi_thread_data_free( mi_thread_data_t* tdfree ) { + // try to add the thread metadata to the cache + for (int i = 0; i < TD_CACHE_SIZE; i++) { + mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t*, &td_cache[i]); + if (td == NULL) { + mi_thread_data_t* expected = NULL; + if (mi_atomic_cas_weak_acq_rel(&td_cache[i], &expected, tdfree)) { + return; + } + } + } + // if that fails, just free it directly + _mi_os_free(tdfree, sizeof(mi_thread_data_t), &_mi_stats_main); +} + +static void mi_thread_data_collect(void) { + // free all thread metadata from the cache + for (int i = 0; i < TD_CACHE_SIZE; i++) { + mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t*, &td_cache[i]); + if (td != NULL) { + mi_thread_data_t* expected = td; + if (mi_atomic_cas_weak_acq_rel(&td_cache[i], &expected, NULL)) { + _mi_os_free( td, sizeof(mi_thread_data_t), &_mi_stats_main ); + } + } + } +} + // Initialize the thread local default heap, called from `mi_thread_init` static bool _mi_heap_init(void) { if (mi_heap_is_initialized(mi_get_default_heap())) return true; @@ -177,16 +239,9 @@ static bool _mi_heap_init(void) { } else { // use `_mi_os_alloc` to allocate directly from the OS - mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main); // Todo: more efficient allocation? - if (td == NULL) { - // if this fails, try once more. (issue #257) - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main); - if (td == NULL) { - // really out of memory - _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); - return false; - } - } + mi_thread_data_t* td = mi_thread_data_alloc(); + if (td == NULL) return false; + // OS allocated so already zero initialized mi_tld_t* tld = &td->tld; mi_heap_t* heap = &td->heap; @@ -242,16 +297,17 @@ static bool _mi_heap_done(mi_heap_t* heap) { // free if not the main thread if (heap != &_mi_heap_main) { mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id()); - _mi_os_free(heap, sizeof(mi_thread_data_t), &_mi_stats_main); + mi_thread_data_free((mi_thread_data_t*)heap); } -#if 0 - // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, - // there may still be delete/free calls after the mi_fls_done is called. Issue #207 else { + mi_thread_data_collect(); // free cached thread metadata + #if 0 + // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, + // there may still be delete/free calls after the mi_fls_done is called. 
Issue #207 _mi_heap_destroy_pages(heap); mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); + #endif } -#endif return false; } From 185f296513da2bfcdb413f7c4821bf238c35e035 Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Apr 2022 20:26:35 -0700 Subject: [PATCH 06/17] improve atomic operations for the thread metadata cache --- src/init.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/init.c b/src/init.c index 72c56b42..32523abf 100644 --- a/src/init.c +++ b/src/init.c @@ -178,10 +178,10 @@ static mi_thread_data_t* mi_thread_data_alloc(void) { // try to find thread metadata in the cache mi_thread_data_t* td; for (int i = 0; i < TD_CACHE_SIZE; i++) { - td = mi_atomic_load_ptr_relaxed(mi_thread_data_t*, &td_cache[i]); + td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); if (td != NULL) { - mi_thread_data_t* expected = td; - if (mi_atomic_cas_weak_acq_rel(&td_cache[i], &expected, NULL)) { + td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); + if (td != NULL) { return td; } } @@ -202,10 +202,10 @@ static mi_thread_data_t* mi_thread_data_alloc(void) { static void mi_thread_data_free( mi_thread_data_t* tdfree ) { // try to add the thread metadata to the cache for (int i = 0; i < TD_CACHE_SIZE; i++) { - mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t*, &td_cache[i]); + mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); if (td == NULL) { mi_thread_data_t* expected = NULL; - if (mi_atomic_cas_weak_acq_rel(&td_cache[i], &expected, tdfree)) { + if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) { return; } } @@ -217,10 +217,10 @@ static void mi_thread_data_free( mi_thread_data_t* tdfree ) { static void mi_thread_data_collect(void) { // free all thread metadata from the cache for (int i = 0; i < TD_CACHE_SIZE; i++) { - mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t*, &td_cache[i]); + mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); if (td != NULL) { - mi_thread_data_t* expected = td; - if (mi_atomic_cas_weak_acq_rel(&td_cache[i], &expected, NULL)) { + td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); + if (td != NULL) { _mi_os_free( td, sizeof(mi_thread_data_t), &_mi_stats_main ); } } From b7677b6f8482daad8374b2cf5430be2156308063 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 8 Apr 2022 14:09:38 -0700 Subject: [PATCH 07/17] fix atomic warnings on clang14 (issue #571) --- include/mimalloc-atomic.h | 8 +++++++- src/init.c | 10 +++++----- src/os.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index e07df84d..7ad5da58 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -23,10 +23,15 @@ terms of the MIT license. A copy of the license can be found in the file #define _Atomic(tp) std::atomic #define mi_atomic(name) std::atomic_##name #define mi_memory_order(name) std::memory_order_##name +#if !defined(ATOMIC_VAR_INIT) || (__cplusplus >= 202002L) // c++20, see issue #571 + #define MI_ATOMIC_VAR_INIT(x) x +#else + #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) +#endif #elif defined(_MSC_VER) // Use MSVC C wrapper for C11 atomics #define _Atomic(tp) tp -#define ATOMIC_VAR_INIT(x) x +#define MI_ATOMIC_VAR_INIT(x) x #define mi_atomic(name) mi_atomic_##name #define mi_memory_order(name) mi_memory_order_##name #else @@ -34,6 +39,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include #define mi_atomic(name) atomic_##name #define mi_memory_order(name) memory_order_##name +#define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) #endif // Various defines for all used memory orders in mimalloc diff --git a/src/init.c b/src/init.c index 854a2228..ad1e4d45 100644 --- a/src/init.c +++ b/src/init.c @@ -25,8 +25,8 @@ const mi_page_t _mi_page_empty = { 0, // used 0, // xblock_size NULL, // local_free - ATOMIC_VAR_INIT(0), // xthread_free - ATOMIC_VAR_INIT(0), // xheap + MI_ATOMIC_VAR_INIT(0), // xthread_free + MI_ATOMIC_VAR_INIT(0), // xheap NULL, NULL }; @@ -91,7 +91,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY, - ATOMIC_VAR_INIT(NULL), + MI_ATOMIC_VAR_INIT(NULL), 0, // tid 0, // cookie { 0, 0 }, // keys @@ -123,7 +123,7 @@ mi_heap_t _mi_heap_main = { &tld_main, MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY, - ATOMIC_VAR_INIT(NULL), + MI_ATOMIC_VAR_INIT(NULL), 0, // thread id 0, // initial cookie { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) @@ -325,7 +325,7 @@ bool _mi_is_main_thread(void) { return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id()); } -static _Atomic(size_t) thread_count = ATOMIC_VAR_INIT(1); +static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); size_t _mi_current_thread_count(void) { return mi_atomic_load_relaxed(&thread_count); diff --git a/src/os.c b/src/os.c index 52939faa..f36f8480 100644 --- a/src/os.c +++ b/src/os.c @@ -983,7 +983,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) if (p != start) return false; #else #if defined(MADV_FREE) - static _Atomic(size_t) advice = ATOMIC_VAR_INIT(MADV_FREE); + static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); int oadvice = (int)mi_atomic_load_relaxed(&advice); int err; while ((err = mi_madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; From e18a8cd72eeb024e64d2929948cc9bd8a4d9bf79 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 8 Apr 2022 16:58:32 -0700 Subject: [PATCH 08/17] add heap walk test --- include/mimalloc.h | 1 + src/heap.c | 9 +- test/main-override-static.c | 205 +++++++++++++++++++++++++++++++++++- 3 files changed, 209 insertions(+), 6 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 91ad352b..c9d48c0e 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -256,6 +256,7 @@ typedef struct mi_heap_area_s { size_t committed; // current available bytes for this area size_t used; // number of allocated blocks size_t block_size; // size in bytes of each block + size_t full_block_size; // size in bytes of a full block including padding and metadata. 
} mi_heap_area_t; typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg); diff --git a/src/heap.c b/src/heap.c index 7cb4e5d1..f4654464 100644 --- a/src/heap.c +++ b/src/heap.c @@ -470,13 +470,14 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v if (page->used == 0) return true; const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); // without padding size_t psize; uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize); if (page->capacity == 1) { // optimize page with one block mi_assert_internal(page->used == 1 && page->free == NULL); - return visitor(mi_page_heap(page), area, pstart, bsize, arg); + return visitor(mi_page_heap(page), area, pstart, ubsize, arg); } // create a bitmap of free blocks. @@ -510,7 +511,7 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v else if ((m & ((uintptr_t)1 << bit)) == 0) { used_count++; uint8_t* block = pstart + (i * bsize); - if (!visitor(mi_page_heap(page), area, block, bsize, arg)) return false; + if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false; } } mi_assert_internal(page->used == used_count); @@ -526,12 +527,14 @@ static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun; mi_heap_area_ex_t xarea; const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); xarea.page = page; xarea.area.reserved = page->reserved * bsize; xarea.area.committed = page->capacity * bsize; xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL); xarea.area.used = page->used * bsize; - xarea.area.block_size = bsize; + xarea.area.block_size = ubsize; + xarea.area.full_block_size = bsize; return fun(heap, &xarea, arg); } diff --git a/test/main-override-static.c b/test/main-override-static.c index 071e4248..e64b987b 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -16,6 +16,9 @@ static void test_aslr(void); static void test_process_info(void); static void test_reserved(void); static void negative_stat(void); +static void alloc_huge(void); +static void test_heap_walk(void); + int main() { mi_version(); @@ -29,6 +32,8 @@ int main() { // invalid_free(); // test_reserved(); // negative_stat(); + test_heap_walk(); + // alloc_huge(); void* p1 = malloc(78); void* p2 = malloc(24); @@ -48,8 +53,10 @@ int main() { //free(p1); //p2 = malloc(32); //mi_free(p2); - mi_collect(true); - mi_stats_print(NULL); + + //mi_collect(true); + //mi_stats_print(NULL); + // test_process_info(); return 0; } @@ -179,4 +186,196 @@ static void negative_stat(void) { *p = 100; mi_free(p); mi_stats_print_out(NULL, NULL); -} \ No newline at end of file +} + +static void alloc_huge(void) { + void* p = mi_malloc(67108872); + mi_free(p); +} + +static bool test_visit(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { + printf("I'm visiting a block of size %zu, allocated size %zu\n", block_size, mi_usable_size(block)); + return true; +} + +static void test_heap_walk(void) { + mi_heap_t* heap = mi_heap_new(); + //mi_heap_malloc(heap, 2097152); + mi_heap_malloc(heap, 2067152); + mi_heap_malloc(heap, 2097160); + mi_heap_malloc(heap, 24576); + mi_heap_visit_blocks(heap, true, &test_visit, NULL); +} + +// ---------------------------- +// bin size experiments +// 
------------------------------ + +#if 0 +#include +#include + +#define MI_INTPTR_SIZE 8 +#define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE) + +#define MI_BIN_HUGE 100 +//#define MI_ALIGN2W + +// Bit scan reverse: return the index of the highest bit. +static inline uint8_t mi_bsr32(uint32_t x); + +#if defined(_MSC_VER) +#include +#include +static inline uint8_t mi_bsr32(uint32_t x) { + uint32_t idx; + _BitScanReverse((DWORD*)&idx, x); + return idx; +} +#elif defined(__GNUC__) || defined(__clang__) +static inline uint8_t mi_bsr32(uint32_t x) { + return (31 - __builtin_clz(x)); +} +#else +static inline uint8_t mi_bsr32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12, + 30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13, + }; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + x++; + return debruijn[(x*0x076be629) >> 27]; +} +#endif + +/* +// Bit scan reverse: return the index of the highest bit. +uint8_t _mi_bsr(uintptr_t x) { + if (x == 0) return 0; + #if MI_INTPTR_SIZE==8 + uint32_t hi = (x >> 32); + return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi)); + #elif MI_INTPTR_SIZE==4 + return mi_bsr32(x); + #else + # error "define bsr for non-32 or 64-bit platforms" + #endif +} +*/ + + +static inline size_t _mi_wsize_from_size(size_t size) { + return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); +} + +// Return the bin for a given field size. +// Returns MI_BIN_HUGE if the size is too large. +// We use `wsize` for the size in "machine word sizes", +// i.e. byte size == `wsize*sizeof(void*)`. +extern inline uint8_t _mi_bin8(size_t size) { + size_t wsize = _mi_wsize_from_size(size); + uint8_t bin; + if (wsize <= 1) { + bin = 1; + } +#if defined(MI_ALIGN4W) + else if (wsize <= 4) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } +#elif defined(MI_ALIGN2W) + else if (wsize <= 8) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } +#else + else if (wsize <= 8) { + bin = (uint8_t)wsize; + } +#endif + else if (wsize > MI_LARGE_WSIZE_MAX) { + bin = MI_BIN_HUGE; + } + else { +#if defined(MI_ALIGN4W) + if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes +#endif + wsize--; + // find the highest bit + uint8_t b = mi_bsr32((uint32_t)wsize); + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). 
+ // - adjust with 3 because we use do not round the first 8 sizes + // which each get an exact bin + bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3; + } + return bin; +} + +static inline uint8_t _mi_bin4(size_t size) { + size_t wsize = _mi_wsize_from_size(size); + uint8_t bin; + if (wsize <= 1) { + bin = 1; + } +#if defined(MI_ALIGN4W) + else if (wsize <= 4) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } +#elif defined(MI_ALIGN2W) + else if (wsize <= 8) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } +#else + else if (wsize <= 8) { + bin = (uint8_t)wsize; + } +#endif + else if (wsize > MI_LARGE_WSIZE_MAX) { + bin = MI_BIN_HUGE; + } + else { + uint8_t b = mi_bsr32((uint32_t)wsize); + bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3; + } + return bin; +} + +static size_t _mi_binx4(size_t bsize) { + if (bsize==0) return 0; + uint8_t b = mi_bsr32((uint32_t)bsize); + if (b <= 1) return bsize; + size_t bin = ((b << 1) | (bsize >> (b - 1))&0x01); + return bin; +} + +static size_t _mi_binx8(size_t bsize) { + if (bsize<=1) return bsize; + uint8_t b = mi_bsr32((uint32_t)bsize); + if (b <= 2) return bsize; + size_t bin = ((b << 2) | (bsize >> (b - 2))&0x03) - 5; + return bin; +} + +static void mi_bins(void) { + //printf(" QNULL(1), /* 0 */ \\\n "); + size_t last_bin = 0; + size_t min_bsize = 0; + size_t last_bsize = 0; + for (size_t bsize = 1; bsize < 2*1024; bsize++) { + size_t size = bsize * 64 * 1024; + size_t bin = _mi_binx8(bsize); + if (bin != last_bin) { + printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_bsize, last_bsize, last_bin); + //printf("QNULL(%6zd), ", wsize); + //if (last_bin%8 == 0) printf("/* %i */ \\\n ", last_bin); + last_bin = bin; + min_bsize = bsize; + } + last_bsize = bsize; + } +} +#endif + From adc8b3187c4ec19fe4e8e8bb2f5aae4e9eb4bba3 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 9 Apr 2022 13:48:16 -0700 Subject: [PATCH 09/17] fix Windows C++ compilation in combination with dynamic overriding by preferring RtlGenRandom --- src/random.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/random.c b/src/random.c index 5057a623..d474a53a 100644 --- a/src/random.c +++ b/src/random.c @@ -168,16 +168,10 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim #if defined(_WIN32) -#if !defined(MI_USE_RTLGENRANDOM) -// We prefer to use BCryptGenRandom instead of RtlGenRandom but it can lead to a deadlock -// under the VS debugger when using dynamic overriding. -#pragma comment (lib,"bcrypt.lib") -#include -static bool os_random_buf(void* buf, size_t buf_len) { - return (BCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); -} -#else -// Use (unofficial) RtlGenRandom +#if defined(MI_USE_RTLGENRANDOM) || defined(__cplusplus) +// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using +// dynamic overriding, we observed it can raise an exception when compiled with C++, and +// sometimes deadlocks when also running under the VS debugger. 
#pragma comment (lib,"advapi32.lib") #define RtlGenRandom SystemFunction036 #ifdef __cplusplus @@ -190,6 +184,12 @@ BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength); static bool os_random_buf(void* buf, size_t buf_len) { return (RtlGenRandom(buf, (ULONG)buf_len) != 0); } +#else +#pragma comment (lib,"bcrypt.lib") +#include +static bool os_random_buf(void* buf, size_t buf_len) { + return (BCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); +} #endif #elif defined(__APPLE__) From 016b2ad5352de399af167a1acb6042cee31f8fbe Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 9 Apr 2022 14:08:27 -0700 Subject: [PATCH 10/17] nicer heap walk test --- test/main-override-static.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/main-override-static.c b/test/main-override-static.c index e64b987b..a5088d3a 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -194,7 +194,12 @@ static void alloc_huge(void) { } static bool test_visit(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { - printf("I'm visiting a block of size %zu, allocated size %zu\n", block_size, mi_usable_size(block)); + if (block == NULL) { + printf("visiting an area with blocks of size %zu (including padding)\n", area->full_block_size); + } + else { + printf(" block of size %zu (allocated size is %zu)\n", block_size, mi_usable_size(block)); + } return true; } From 6c91c75b140f650845e918005979a05ac0da7d5e Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 9 Apr 2022 14:33:20 -0700 Subject: [PATCH 11/17] remove thread local segment cache --- include/mimalloc-types.h | 3 -- include/mimalloc.h | 2 +- src/init.c | 2 +- src/options.c | 2 +- src/segment.c | 91 ++++------------------------------------ 5 files changed, 10 insertions(+), 90 deletions(-) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index be3cf503..a3fad92d 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -521,9 +521,6 @@ typedef struct mi_segments_tld_s { size_t peak_count; // peak number of segments size_t current_size; // current size of all segments size_t peak_size; // peak size of all segments - size_t cache_count; // number of segments in the cache - size_t cache_size; // total size of all segments in the cache - mi_segment_t* cache; // (small) cache of segments mi_stats_t* stats; // points to tld stats mi_os_tld_t* os; // points to os stats } mi_segments_tld_t; diff --git a/include/mimalloc.h b/include/mimalloc.h index c9d48c0e..2707bc16 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -313,7 +313,7 @@ typedef enum mi_option_e { mi_option_reserve_huge_os_pages, // reserve N huge OS pages (1GiB) at startup mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node mi_option_reserve_os_memory, // reserve specified amount of OS memory at startup - mi_option_segment_cache, + mi_option_deprecated_segment_cache, mi_option_page_reset, mi_option_abandoned_page_reset, mi_option_segment_reset, diff --git a/src/init.c b/src/init.c index ce43e3a6..f910c287 100644 --- a/src/init.c +++ b/src/init.c @@ -112,7 +112,7 @@ static mi_tld_t tld_main = { 0, false, &_mi_heap_main, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, - 0, 0, 0, 0, 0, 0, NULL, + 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments { 0, &tld_main.stats }, // os diff --git a/src/options.c b/src/options.c index e1944a19..4f857ec6 100644 --- a/src/options.c +++ 
b/src/options.c @@ -78,7 +78,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages { -1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N { 0, UNINIT, MI_OPTION(reserve_os_memory) }, - { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread + { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread { 1, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free { 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) diff --git a/src/segment.c b/src/segment.c index 8a83ceed..a98edcfd 100644 --- a/src/segment.c +++ b/src/segment.c @@ -110,17 +110,6 @@ static void mi_segment_insert_in_free_queue(mi_segment_t* segment, mi_segments_t Invariant checking ----------------------------------------------------------- */ -#if (MI_DEBUG>=2) -static bool mi_segment_is_in_free_queue(const mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_segment_queue_t* queue = mi_segment_free_queue(segment, tld); - bool in_queue = (queue!=NULL && (segment->next != NULL || segment->prev != NULL || queue->first == segment)); - if (in_queue) { - mi_assert_expensive(mi_segment_queue_contains(queue, segment)); - } - return in_queue; -} -#endif - static size_t mi_segment_page_size(const mi_segment_t* segment) { if (segment->capacity > 1) { mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); @@ -483,64 +472,8 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se _mi_mem_free(segment, segment_size, segment->memid, fully_committed, any_reset, tld->os); } - -// The thread local segment cache is limited to be at most 1/8 of the peak size of segments in use, -#define MI_SEGMENT_CACHE_FRACTION (8) - -// note: returned segment may be partially reset -static mi_segment_t* mi_segment_cache_pop(size_t segment_size, mi_segments_tld_t* tld) { - if (segment_size != 0 && segment_size != MI_SEGMENT_SIZE) return NULL; - mi_segment_t* segment = tld->cache; - if (segment == NULL) return NULL; - tld->cache_count--; - tld->cache = segment->next; - segment->next = NULL; - mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); - _mi_stat_decrease(&tld->stats->segments_cache, 1); - return segment; -} - -static bool mi_segment_cache_full(mi_segments_tld_t* tld) -{ - // if (tld->count == 1 && tld->cache_count==0) return false; // always cache at least the final segment of a thread - size_t max_cache = mi_option_get(mi_option_segment_cache); - if (tld->cache_count < max_cache - && tld->cache_count < (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION)) // at least allow a 1 element cache - ) { - return false; - } - // take the opportunity to reduce the segment cache if it is too large (now) - // TODO: this never happens as we check against peak usage, should we use current usage instead? 
- while (tld->cache_count > max_cache) { //(1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { - mi_segment_t* segment = mi_segment_cache_pop(0,tld); - mi_assert_internal(segment != NULL); - if (segment != NULL) mi_segment_os_free(segment, segment->segment_size, tld); - } - return true; -} - -static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(!mi_segment_is_in_free_queue(segment, tld)); - mi_assert_internal(segment->next == NULL); - if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) { - return false; - } - mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); - segment->next = tld->cache; - tld->cache = segment; - tld->cache_count++; - _mi_stat_increase(&tld->stats->segments_cache,1); - return true; -} - // called by threads that are terminating to free cached segments -void _mi_segment_thread_collect(mi_segments_tld_t* tld) { - mi_segment_t* segment; - while ((segment = mi_segment_cache_pop(0,tld)) != NULL) { - mi_segment_os_free(segment, segment->segment_size, tld); - } - mi_assert_internal(tld->cache_count == 0); - mi_assert_internal(tld->cache == NULL); +void _mi_segment_thread_collect(mi_segments_tld_t* tld) { #if MI_DEBUG>=2 if (!_mi_is_main_thread()) { mi_assert_internal(tld->pages_reset.first == NULL); @@ -712,13 +645,8 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t mi_assert(segment->prev == NULL); _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); - if (!force && mi_segment_cache_push(segment, tld)) { - // it is put in our cache - } - else { - // otherwise return it to the OS - mi_segment_os_free(segment, segment->segment_size, tld); - } + // return it to the OS + mi_segment_os_free(segment, segment->segment_size, tld); } /* ----------------------------------------------------------- @@ -1217,15 +1145,10 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_s { mi_assert_internal(page_kind <= MI_PAGE_LARGE); mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); - // 1. try to get a segment from our cache - mi_segment_t* segment = mi_segment_cache_pop(MI_SEGMENT_SIZE, tld); - if (segment != NULL) { - mi_segment_init(segment, 0, page_kind, page_shift, tld, os_tld); - return segment; - } - // 2. try to reclaim an abandoned segment + + // 1. try to reclaim an abandoned segment bool reclaimed; - segment = mi_segment_try_reclaim(heap, block_size, page_kind, &reclaimed, tld); + mi_segment_t* segment = mi_segment_try_reclaim(heap, block_size, page_kind, &reclaimed, tld); if (reclaimed) { // reclaimed the right page right into the heap mi_assert_internal(segment != NULL && segment->page_kind == page_kind && page_kind <= MI_PAGE_LARGE); @@ -1235,7 +1158,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_s // reclaimed a segment with empty pages (of `page_kind`) in it return segment; } - // 3. otherwise allocate a fresh segment + // 2. 
otherwise allocate a fresh segment return mi_segment_alloc(0, page_kind, page_shift, tld, os_tld); } From faca422b718625e656012391d94c564f5a67d977 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 9 Apr 2022 14:48:30 -0700 Subject: [PATCH 12/17] fix msvc warnings at level 4 --- src/alloc-posix.c | 26 +++++++++++++------------- src/init.c | 2 +- src/options.c | 3 +++ src/segment.c | 7 ++++++- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/alloc-posix.c b/src/alloc-posix.c index ee5babe1..176e7ec3 100644 --- a/src/alloc-posix.c +++ b/src/alloc-posix.c @@ -32,17 +32,17 @@ terms of the MIT license. A copy of the license can be found in the file #endif -size_t mi_malloc_size(const void* p) mi_attr_noexcept { +mi_decl_nodiscard size_t mi_malloc_size(const void* p) mi_attr_noexcept { //if (!mi_is_in_heap_region(p)) return 0; return mi_usable_size(p); } -size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept { +mi_decl_nodiscard size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept { //if (!mi_is_in_heap_region(p)) return 0; return mi_usable_size(p); } -size_t mi_malloc_good_size(size_t size) mi_attr_noexcept { +mi_decl_nodiscard size_t mi_malloc_good_size(size_t size) mi_attr_noexcept { return mi_good_size(size); } @@ -65,24 +65,24 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept return 0; } -mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { void* p = mi_malloc_aligned(size, alignment); mi_assert_internal(((uintptr_t)p % alignment) == 0); return p; } -mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept { return mi_memalign( _mi_os_page_size(), size ); } -mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept { size_t psize = _mi_os_page_size(); if (size >= SIZE_MAX - psize) return NULL; // overflow size_t asize = _mi_align_up(size, psize); return mi_malloc_aligned(asize, psize); } -mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { if (mi_unlikely((size&(alignment-1)) != 0)) { // C11 requires alignment>0 && integral multiple, see #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment); @@ -95,13 +95,13 @@ mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_n return p; } -void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD +mi_decl_nodiscard void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD void* newp = mi_reallocn(p,count,size); if (newp==NULL) { errno = ENOMEM; } return newp; } -int mi_reallocarr( void* p, size_t count, size_t size ) mi_attr_noexcept { // NetBSD +mi_decl_nodiscard int mi_reallocarr( void* p, size_t count, size_t size ) mi_attr_noexcept { // NetBSD mi_assert(p != NULL); if (p == NULL) { errno = EINVAL; @@ -120,7 +120,7 @@ void* mi__expand(void* p, size_t newsize) mi_attr_noexcept { // Microsoft return res; } -mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict 
unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept { if (s==NULL) return NULL; size_t len; for(len = 0; s[len] != 0; len++) { } @@ -132,7 +132,7 @@ mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noex return p; } -mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept { return (unsigned char*)mi_strdup((const char*)s); } @@ -172,10 +172,10 @@ int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) #endif } -void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { // Microsoft +mi_decl_nodiscard void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { // Microsoft return mi_recalloc_aligned_at(p, newcount, size, alignment, offset); } -void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { // Microsoft +mi_decl_nodiscard void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { // Microsoft return mi_recalloc_aligned(p, newcount, size, alignment); } diff --git a/src/init.c b/src/init.c index f910c287..e8913818 100644 --- a/src/init.c +++ b/src/init.c @@ -458,7 +458,7 @@ bool _mi_preloading(void) { return os_preloading; } -bool mi_is_redirected(void) mi_attr_noexcept { +mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept { return mi_redirected; } diff --git a/src/options.c b/src/options.c index 4f857ec6..8700fc76 100644 --- a/src/options.c +++ b/src/options.c @@ -116,6 +116,7 @@ void _mi_options_init(void) { mi_decl_nodiscard long mi_option_get(mi_option_t option) { mi_assert(option >= 0 && option < _mi_option_last); + if (option < 0 || option >= _mi_option_last) return 0; mi_option_desc_t* desc = &options[option]; mi_assert(desc->option == option); // index should match the option if (mi_unlikely(desc->init == UNINIT)) { @@ -126,6 +127,7 @@ mi_decl_nodiscard long mi_option_get(mi_option_t option) { void mi_option_set(mi_option_t option, long value) { mi_assert(option >= 0 && option < _mi_option_last); + if (option < 0 || option >= _mi_option_last) return; mi_option_desc_t* desc = &options[option]; mi_assert(desc->option == option); // index should match the option desc->value = value; @@ -134,6 +136,7 @@ void mi_option_set(mi_option_t option, long value) { void mi_option_set_default(mi_option_t option, long value) { mi_assert(option >= 0 && option < _mi_option_last); + if (option < 0 || option >= _mi_option_last) return; mi_option_desc_t* desc = &options[option]; if (desc->init != INITIALIZED) { desc->value = value; diff --git a/src/segment.c b/src/segment.c index a98edcfd..bd36f627 100644 --- a/src/segment.c +++ b/src/segment.c @@ -191,7 +191,10 @@ static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t* mi_assert_internal((segment->segment_info_size - os_psize) >= (sizeof(mi_segment_t) + ((segment->capacity - 1) * sizeof(mi_page_t)))); mi_assert_internal(((uintptr_t)segment + segment->segment_info_size) % os_psize == 0); mi_segment_protect_range((uint8_t*)segment + segment->segment_info_size - os_psize, os_psize, protect); - if (MI_SECURE <= 1 || segment->capacity == 1) { + #if (MI_SECURE >= 2) + if (segment->capacity == 1) + #endif + { // and protect the last (or only) page too mi_assert_internal(MI_SECURE <= 1 || segment->page_kind >= 
MI_PAGE_LARGE); uint8_t* start = (uint8_t*)segment + segment->segment_size - os_psize; @@ -207,6 +210,7 @@ static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t* mi_segment_protect_range(start, os_psize, protect); } } + #if (MI_SECURE >= 2) else { // or protect every page const size_t page_size = mi_segment_page_size(segment); @@ -216,6 +220,7 @@ static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t* } } } + #endif } } From c8258514043176929b93113518d961db5b094864 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 9 Apr 2022 15:59:05 -0700 Subject: [PATCH 13/17] define MEM_EXTENDED_PARAMETER structure ourselves on Windows in order to compile with older SDK's --- src/os.c | 73 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/src/os.c b/src/os.c index 860de82e..6a303a85 100644 --- a/src/os.c +++ b/src/os.c @@ -141,20 +141,41 @@ size_t _mi_os_good_alloc_size(size_t size) { // We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. // So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) // NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) -// -// We hide MEM_EXTENDED_PARAMETER to compile with older SDK's. +// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's. +typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E { + MiMemExtendedParameterInvalidType = 0, + MiMemExtendedParameterAddressRequirements, + MiMemExtendedParameterNumaNode, + MiMemExtendedParameterPartitionHandle, + MiMemExtendedParameterUserPhysicalHandle, + MiMemExtendedParameterAttributeFlags, + MiMemExtendedParameterMax +} MI_MEM_EXTENDED_PARAMETER_TYPE; + +typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S { + struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type; + union { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg; +} MI_MEM_EXTENDED_PARAMETER; + +typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S { + PVOID LowestStartingAddress; + PVOID HighestEndingAddress; + SIZE_T Alignment; +} MI_MEM_ADDRESS_REQUIREMENTS; + +#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE 0x00000010 + #include -typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ void*, ULONG); -typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ PVOID, ULONG); +typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); +typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); static PVirtualAlloc2 pVirtualAlloc2 = NULL; static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; // Similarly, GetNumaProcesorNodeEx is only supported since Windows 7 -#if (_WIN32_WINNT < 0x601) // before Win7 -typedef struct _PROCESSOR_NUMBER { WORD Group; BYTE Number; BYTE Reserved; } PROCESSOR_NUMBER, *PPROCESSOR_NUMBER; -#endif -typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(PPROCESSOR_NUMBER ProcNumber); -typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(PPROCESSOR_NUMBER Processor, PUSHORT NodeNumber); +typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER; + +typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* 
ProcNumber); +typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber); typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask); static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL; static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL; @@ -348,20 +369,18 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment } } #endif -#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) // on modern Windows try use VirtualAlloc2 for aligned allocation if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { - MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; + MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; reqs.Alignment = try_alignment; - MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; - param.Type = MemExtendedParameterAddressRequirements; - param.Pointer = &reqs; + MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; + param.Type.Type = MiMemExtendedParameterAddressRequirements; + param.Arg.Pointer = &reqs; void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, ¶m, 1); if (p != NULL) return p; _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags); // fall through on error } -#endif // last resort return VirtualAlloc(addr, size, flags, PAGE_READWRITE); } @@ -1109,21 +1128,17 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) mi_win_enable_large_os_pages(); - #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) - MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} }; + MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} }; // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages static bool mi_huge_pages_available = true; if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { - #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE - #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) - #endif - params[0].Type = 5; // == MemExtendedParameterAttributeFlags; - params[0].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; + params[0].Type.Type = MiMemExtendedParameterAttributeFlags; + params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; ULONG param_count = 1; if (numa_node >= 0) { param_count++; - params[1].Type = MemExtendedParameterNumaNode; - params[1].ULong = (unsigned)numa_node; + params[1].Type.Type = MiMemExtendedParameterNumaNode; + params[1].Arg.ULong = (unsigned)numa_node; } SIZE_T psize = size; void* base = addr; @@ -1139,13 +1154,11 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) } // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation if (pVirtualAlloc2 != NULL && numa_node >= 0) { - params[0].Type = MemExtendedParameterNumaNode; - params[0].ULong = (unsigned)numa_node; + params[0].Type.Type = MiMemExtendedParameterNumaNode; + params[0].Arg.ULong = (unsigned)numa_node; return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); } - #else - MI_UNUSED(numa_node); - #endif + // otherwise use regular virtual alloc on older windows return VirtualAlloc(addr, size, flags, PAGE_READWRITE); } @@ -1299,7 +1312,7 @@ static size_t mi_os_numa_nodex(void) { USHORT numa_node = 0; if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) { // Extended API is supported - 
PROCESSOR_NUMBER pnum; + MI_PROCESSOR_NUMBER pnum; (*pGetCurrentProcessorNumberEx)(&pnum); USHORT nnode = 0; BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode); From 96bf3a803901aa6ca12690685e88ce3e41b2befa Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 9 Apr 2022 16:16:11 -0700 Subject: [PATCH 14/17] fix warning --- src/os.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os.c b/src/os.c index 6a303a85..f6a3cc86 100644 --- a/src/os.c +++ b/src/os.c @@ -347,7 +347,7 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats return !err; } -#if !(defined(__wasi__) || defined(MI_USE_SBRK) || defined(MAP_ALIGNED)) +#if !(defined(__wasi__) || defined(MI_USE_SBRK) || defined(MAP_ALIGNED) || MI_INTPTR_SIZE < 8) static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size); #endif @@ -691,7 +691,7 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) if (hint%try_alignment != 0) return NULL; return (void*)hint; } -#elif defined(__wasi__) || defined(MI_USE_SBRK) || defined(MAP_ALIGNED) +#elif defined(__wasi__) || defined(MI_USE_SBRK) || defined(MAP_ALIGNED) || (MI_INTPTR_SIZE < 8) // no need for mi_os_get_aligned_hint #else static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { From f4b7ea9e9ee5648975266c532e90ebfc9530045b Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 9 Apr 2022 16:20:27 -0700 Subject: [PATCH 15/17] fix compile warnings --- src/segment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/segment.c b/src/segment.c index bd36f627..75265073 100644 --- a/src/segment.c +++ b/src/segment.c @@ -110,6 +110,7 @@ static void mi_segment_insert_in_free_queue(mi_segment_t* segment, mi_segments_t Invariant checking ----------------------------------------------------------- */ +#if (MI_DEBUG >= 2) || (MI_SECURE >= 2) static size_t mi_segment_page_size(const mi_segment_t* segment) { if (segment->capacity > 1) { mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); @@ -120,7 +121,7 @@ static size_t mi_segment_page_size(const mi_segment_t* segment) { return segment->segment_size; } } - +#endif #if (MI_DEBUG>=2) static bool mi_pages_reset_contains(const mi_page_t* page, mi_segments_tld_t* tld) { @@ -479,6 +480,7 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se // called by threads that are terminating to free cached segments void _mi_segment_thread_collect(mi_segments_tld_t* tld) { + MI_UNUSED_RELEASE(tld); #if MI_DEBUG>=2 if (!_mi_is_main_thread()) { mi_assert_internal(tld->pages_reset.first == NULL); From 2ab70f3c84a8ca66c4ffdbc19da0ad62c39765ef Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 10 Apr 2022 12:55:36 -0700 Subject: [PATCH 16/17] remove ifdefs around mi_os_aligned_hint --- src/os.c | 104 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 51 deletions(-) diff --git a/src/os.c b/src/os.c index f6a3cc86..69eb49df 100644 --- a/src/os.c +++ b/src/os.c @@ -308,7 +308,57 @@ static int mi_madvise(void* addr, size_t length, int advice) { /* ----------------------------------------------------------- - free memory + aligned hinting +-------------------------------------------------------------- */ + +// On 64-bit systems, we can do efficient aligned allocation by using +// the 2TiB to 30TiB area to allocate those. +#if (MI_INTPTR_SIZE >= 8) +static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; + +// Return a MI_SEGMENT_SIZE aligned address that is probably available. 
From 2ab70f3c84a8ca66c4ffdbc19da0ad62c39765ef Mon Sep 17 00:00:00 2001
From: Daan Leijen
Date: Sun, 10 Apr 2022 12:55:36 -0700
Subject: [PATCH 16/17] remove ifdefs around mi_os_aligned_hint

---
 src/os.c | 104 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 53 insertions(+), 51 deletions(-)

diff --git a/src/os.c b/src/os.c
index f6a3cc86..69eb49df 100644
--- a/src/os.c
+++ b/src/os.c
@@ -308,7 +308,57 @@ static int mi_madvise(void* addr, size_t length, int advice) {

 /* -----------------------------------------------------------
-  free memory
+  aligned hinting
 -------------------------------------------------------------- */
+
+// On 64-bit systems, we can do efficient aligned allocation by using
+// the 2TiB to 30TiB area to allocate those.
+#if (MI_INTPTR_SIZE >= 8)
+static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
+
+// Return a MI_SEGMENT_SIZE aligned address that is probably available.
+// If this returns NULL, the OS will determine the address but on some OS's that may not be
+// properly aligned which can be more costly as it needs to be adjusted afterwards.
+// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
+// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
+// in the middle of the 2TiB - 6TiB address range (see issue #372))
+
+#define MI_HINT_BASE ((uintptr_t)2 << 40)  // 2TiB start
+#define MI_HINT_AREA ((uintptr_t)4 << 40)  // upto 6TiB (since before win8 there is "only" 8TiB available to processes)
+#define MI_HINT_MAX  ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
+
+static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
+{
+  if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
+  size = _mi_align_up(size, MI_SEGMENT_SIZE);
+  if (size > 1*MI_GiB) return NULL;  // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
+  #if (MI_SECURE>0)
+  size += MI_SEGMENT_SIZE;  // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas.
+  #endif
+
+  uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
+  if (hint == 0 || hint > MI_HINT_MAX) {  // wrap or initialize
+    uintptr_t init = MI_HINT_BASE;
+    #if (MI_SECURE>0 || MI_DEBUG==0)  // security: randomize start of aligned allocations unless in debug mode
+    uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+    init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA);  // (randomly 20 bits)*4MiB == 0 to 4TiB
+    #endif
+    uintptr_t expected = hint + size;
+    mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
+    hint = mi_atomic_add_acq_rel(&aligned_base, size);  // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
+  }
+  if (hint%try_alignment != 0) return NULL;
+  return (void*)hint;
+}
+#else
+static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
+  MI_UNUSED(try_alignment); MI_UNUSED(size);
+  return NULL;
+}
+#endif
+
+/* -----------------------------------------------------------
+  Free memory
 -------------------------------------------------------------- */

 static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats)
@@ -347,9 +397,6 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats
   return !err;
 }

-#if !(defined(__wasi__) || defined(MI_USE_SBRK) || defined(MAP_ALIGNED) || MI_INTPTR_SIZE < 8)
-static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size);
-#endif

 /* -----------------------------------------------------------
   Raw allocation on Windows (VirtualAlloc)
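The hunk above concentrates the hinting logic in one unconditional block: a global atomic cursor (aligned_base) is advanced by the segment-aligned request size, so successive mappings receive distinct MI_SEGMENT_SIZE-aligned hint addresses in the 2TiB-30TiB window, and the cursor is re-seeded (optionally randomized under MI_SECURE) once it runs past MI_HINT_MAX. The stand-alone sketch below mirrors that arithmetic with plain C11 atomics instead of mimalloc's mi_atomic_* wrappers and omits the randomization and the MI_SECURE gap padding; the constants and control flow follow the hunk, the rest is illustrative only.

    #include <stdint.h>
    #include <stddef.h>
    #include <stdatomic.h>

    #define SEGMENT_SIZE  ((uintptr_t)4 << 20)   // 4MiB segments (MI_SEGMENT_SIZE)
    #define HINT_BASE     ((uintptr_t)2 << 40)   // start hinting at 2TiB
    #define HINT_MAX      ((uintptr_t)30 << 40)  // wrap once the cursor passes 30TiB
    #define ONE_GIB       ((uintptr_t)1 << 30)

    static _Atomic(uintptr_t) aligned_base;      // shared cursor, starts at 0

    void* get_aligned_hint(size_t try_alignment, size_t size) {
      if (try_alignment <= 1 || try_alignment > SEGMENT_SIZE) return NULL;
      uintptr_t sz = ((uintptr_t)size + SEGMENT_SIZE - 1) & ~(SEGMENT_SIZE - 1);  // round up to whole segments
      if (sz > ONE_GIB) return NULL;                        // very large areas stay fully ASLR-randomized
      uintptr_t hint = atomic_fetch_add(&aligned_base, sz); // returns the previous cursor value
      if (hint == 0 || hint > HINT_MAX) {                   // first use, or wrapped past 30TiB
        uintptr_t expected = hint + sz;
        atomic_compare_exchange_strong(&aligned_base, &expected, HINT_BASE);  // re-seed the cursor
        hint = atomic_fetch_add(&aligned_base, sz);         // may still race; it is only a hint
      }
      if (hint % try_alignment != 0) return NULL;
      return (void*)hint;
    }

The returned address is only ever passed as the address hint to mmap or VirtualAlloc; if the OS cannot honor it, the allocation still succeeds and, as the comment in the hunk notes, the result may have to be re-aligned afterwards at extra cost.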
@@ -652,53 +699,6 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
 }
 #endif

-// On 64-bit systems, we can do efficient aligned allocation by using
-// the 2TiB to 30TiB area to allocate them.
-#if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || defined(MI_OS_USE_MMAP))
-static mi_decl_cache_align _Atomic(uintptr_t) aligned_base;
-
-// Return a 4MiB aligned address that is probably available.
-// If this returns NULL, the OS will determine the address but on some OS's that may not be
-// properly aligned which can be more costly as it needs to be adjusted afterwards.
-// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
-// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
-// in the middle of the 2TiB - 6TiB address range (see issue #372))
-
-#define MI_HINT_BASE ((uintptr_t)2 << 40)  // 2TiB start
-#define MI_HINT_AREA ((uintptr_t)4 << 40)  // upto 6TiB (since before win8 there is "only" 8TiB available to processes)
-#define MI_HINT_MAX  ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
-
-static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
-{
-  if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
-  size = _mi_align_up(size, MI_SEGMENT_SIZE);
-  if (size > 1*MI_GiB) return NULL;  // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
-  #if (MI_SECURE>0)
-  size += MI_SEGMENT_SIZE;  // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas.
-  #endif
-
-  uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
-  if (hint == 0 || hint > MI_HINT_MAX) {  // wrap or initialize
-    uintptr_t init = MI_HINT_BASE;
-    #if (MI_SECURE>0 || MI_DEBUG==0)  // security: randomize start of aligned allocations unless in debug mode
-    uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
-    init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA);  // (randomly 20 bits)*4MiB == 0 to 4TiB
-    #endif
-    uintptr_t expected = hint + size;
-    mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
-    hint = mi_atomic_add_acq_rel(&aligned_base, size);  // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
-  }
-  if (hint%try_alignment != 0) return NULL;
-  return (void*)hint;
-}
-#elif defined(__wasi__) || defined(MI_USE_SBRK) || defined(MAP_ALIGNED) || (MI_INTPTR_SIZE < 8)
-// no need for mi_os_get_aligned_hint
-#else
-static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
-  MI_UNUSED(try_alignment); MI_UNUSED(size);
-  return NULL;
-}
-#endif

 /* -----------------------------------------------------------
   Primitive allocation from the OS.
@@ -799,6 +799,7 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit,
   return p;
 }

+
 /* -----------------------------------------------------------
   OS API: alloc, free, alloc_aligned
 ----------------------------------------------------------- */
@@ -826,6 +827,7 @@ void _mi_os_free(void* p, size_t size, mi_stats_t* stats) {

 void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats)
 {
+  MI_UNUSED(&mi_os_get_aligned_hint);  // suppress unused warnings
   MI_UNUSED(tld_stats);
   if (size == 0) return NULL;
   size = _mi_os_good_alloc_size(size);
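With the #ifdef maze gone, mi_os_get_aligned_hint is always compiled but is not referenced on every platform or configuration, so the last hunk of the patch above keeps the compiler quiet by taking the function's address inside MI_UNUSED. A tiny stand-alone illustration of that trick (hypothetical names, not mimalloc code):

    #include <stdio.h>

    #define UNUSED(x)  (void)(x)

    static int helper(int x) { return x + 1; }  // may end up unreferenced in some configurations

    int main(void) {
      UNUSED(&helper);   // taking the address counts as a use, so "defined but not used" stays silent
      printf("ok\n");
      return 0;
    }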
From 9afdf762a6f6909040903963b756f600cbea967e Mon Sep 17 00:00:00 2001
From: Daan Leijen
Date: Sun, 10 Apr 2022 12:55:59 -0700
Subject: [PATCH 17/17] fix c++ compilation warning for an unused parameter

---
 src/alloc-override.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/alloc-override.c b/src/alloc-override.c
index 12e9e0d6..e29cb4b2 100644
--- a/src/alloc-override.c
+++ b/src/alloc-override.c
@@ -166,8 +166,8 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
   void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
   void operator delete (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
   void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
-  void operator delete (void* p, std::align_val_t al, const std::nothrow_t& tag) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
-  void operator delete[](void* p, std::align_val_t al, const std::nothrow_t& tag) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+  void operator delete (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+  void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }

   void* operator new( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
   void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
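The final patch avoids -Wunused-parameter in the C++ overrides simply by dropping the parameter name: the std::nothrow_t& argument must stay in the signature so overload resolution keeps working, but nothing in the body reads it. A small stand-alone illustration of the same idiom in C++ (hypothetical example, not mimalloc code):

    #include <new>      // std::nothrow_t, std::nothrow
    #include <cstdio>

    // The nothrow_t argument only selects the overload; it is never read,
    // so it is left unnamed instead of being called 'tag'.
    static void release(void* p) noexcept                        { std::printf("release %p\n", p); }
    static void release(void* p, const std::nothrow_t&) noexcept { release(p); }

    int main() {
      int x = 0;
      release(&x);
      release(&x, std::nothrow);  // picks the tag-dispatched overload; the tag itself is unused
      return 0;
    }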