From d36d04b4a6e5ada99fa36447332e5d7d3b1d33be Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 31 Oct 2019 15:35:10 -0700 Subject: [PATCH 01/48] add arena for huge page management --- ide/vs2019/mimalloc-override.vcxproj | 1 + ide/vs2019/mimalloc.vcxproj | 1 + include/mimalloc-internal.h | 1 + src/arena.c | 369 +++++++++++++++++++++++++++ src/memory.c | 80 ++++-- src/os.c | 4 +- 6 files changed, 435 insertions(+), 21 deletions(-) create mode 100644 src/arena.c diff --git a/ide/vs2019/mimalloc-override.vcxproj b/ide/vs2019/mimalloc-override.vcxproj index 96a8924f..09fd37fb 100644 --- a/ide/vs2019/mimalloc-override.vcxproj +++ b/ide/vs2019/mimalloc-override.vcxproj @@ -231,6 +231,7 @@ + diff --git a/ide/vs2019/mimalloc.vcxproj b/ide/vs2019/mimalloc.vcxproj index 28e96d71..1fabff5e 100644 --- a/ide/vs2019/mimalloc.vcxproj +++ b/ide/vs2019/mimalloc.vcxproj @@ -217,6 +217,7 @@ + diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index ccf12a06..2b881ac9 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -57,6 +57,7 @@ void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocat void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data size_t _mi_os_good_alloc_size(size_t size); + // memory.c void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld); void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats); diff --git a/src/arena.c b/src/arena.c new file mode 100644 index 00000000..5f33965a --- /dev/null +++ b/src/arena.c @@ -0,0 +1,369 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + +-----------------------------------------------------------------------------*/ +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" + +#include // memset + +// os.c +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); +void* _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment); +int _mi_os_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + +#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE +#define MI_ARENA_BLOCK_SIZE (4*MI_SEGMENT_ALIGN) // 16MiB +#define MI_MAX_ARENAS (64) + +// Block info: bit 0 contains the `in_use` bit, the upper bits the +// size in count of arena blocks. +typedef uintptr_t mi_block_info_t; + +// A memory arena descriptor +typedef struct mi_arena_s { + uint8_t* start; // the start of the memory area + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + bool is_zero_init; // is the arena zero initialized? 
+ bool is_large; // large OS page allocated + _Atomic(uintptr_t) block_bottom; // optimization to start the search for free blocks + _Atomic(mi_block_info_t) blocks[1]; // `block_count` block info's +} mi_arena_t; + + +// The available arenas +static _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; +static _Atomic(uintptr_t) mi_arena_count; // = 0 + + +/* ----------------------------------------------------------- + Arena allocations get a memory id where the lower 8 bits are + the arena index +1, and the upper bits the block index. +----------------------------------------------------------- */ + +// Use `0` as a special id for direct OS allocated memory. +#define MI_MEMID_OS 0 + +static size_t mi_memid_create(size_t arena_index, size_t block_index) { + mi_assert_internal(arena_index < 0xFE); + return ((block_index << 8) | ((arena_index+1) & 0xFF)); +} + +static void mi_memid_indices(size_t memid, size_t* arena_index, size_t* block_index) { + mi_assert_internal(memid != MI_MEMID_OS); + *arena_index = (memid & 0xFF) - 1; + *block_index = (memid >> 8); +} + +/* ----------------------------------------------------------- + Block info +----------------------------------------------------------- */ + +static bool mi_block_is_in_use(mi_block_info_t info) { + return ((info&1) != 0); +} + +static size_t mi_block_count(mi_block_info_t info) { + return (info>>1); +} + +static mi_block_info_t mi_block_info_create(size_t bcount, bool in_use) { + return (((mi_block_info_t)bcount << 1) | (in_use ? 1 : 0)); +} + + +/* ----------------------------------------------------------- + Thread safe allocation in an arena +----------------------------------------------------------- */ + +static void* mi_arena_allocx(mi_arena_t* arena, size_t start_idx, size_t end_idx, size_t needed_bcount, bool* is_zero, size_t* block_index) +{ + // Scan linearly through all block info's + // Skipping used ranges, coalescing free ranges on demand. + mi_assert_internal(needed_bcount > 0); + mi_assert_internal(start_idx <= arena->block_count); + mi_assert_internal(end_idx <= arena->block_count); + _Atomic(mi_block_info_t)* block = &arena->blocks[start_idx]; + _Atomic(mi_block_info_t)* end = &arena->blocks[end_idx]; + while (block < end) { + mi_block_info_t binfo = mi_atomic_read_relaxed(block); + size_t bcount = mi_block_count(binfo); + if (mi_block_is_in_use(binfo)) { + // in-use, skip ahead + mi_assert_internal(bcount > 0); + block += bcount; + } + else { + // free blocks + if (bcount==0) { + // optimization: + // use 0 initialized blocks at the end, to use single atomic operation + // initially to reduce contention (as we don't need to split) + if (block + needed_bcount > end) { + return NULL; // does not fit + } + else if (!mi_atomic_cas_weak(block, mi_block_info_create(needed_bcount, true), binfo)) { + // ouch, someone else was quicker. Try again.. 
+ continue; + } + else { + // we got it: return a pointer to the claimed memory + ptrdiff_t idx = (block - arena->blocks); + *is_zero = arena->is_zero_init; + *block_index = idx; + return (arena->start + (idx*MI_ARENA_BLOCK_SIZE)); + } + } + + mi_assert_internal(bcount>0); + if (needed_bcount > bcount) { +#if 0 // MI_NO_ARENA_COALESCE + block += bcount; // too small, skip to the next range + continue; +#else + // too small, try to coalesce + _Atomic(mi_block_info_t)* block_next = block + bcount; + if (block_next >= end) { + return NULL; // does not fit + } + mi_block_info_t binfo_next = mi_atomic_read(block_next); + size_t bcount_next = mi_block_count(binfo_next); + if (mi_block_is_in_use(binfo_next)) { + // next block is in use, cannot coalesce + block += (bcount + bcount_next); // skip ahea over both blocks + } + else { + // next block is free, try to coalesce + // first set the next one to being used to prevent dangling ranges + if (!mi_atomic_cas_strong(block_next, mi_block_info_create(bcount_next, true), binfo_next)) { + // someone else got in before us.. try again + continue; + } + else { + if (!mi_atomic_cas_strong(block, mi_block_info_create(bcount + bcount_next, true), binfo)) { // use strong to increase success chance + // someone claimed/coalesced the block in the meantime + // first free the next block again.. + bool ok = mi_atomic_cas_strong(block_next, mi_block_info_create(bcount_next, false), binfo_next); // must be strong + mi_assert(ok); UNUSED(ok); + // and try again + continue; + } + else { + // coalesced! try again + // todo: we could optimize here to immediately claim the block if the + // coalesced size is a fit instead of retrying. Keep it simple for now. + continue; + } + } + } +#endif + } + else { // needed_bcount <= bcount + mi_assert_internal(needed_bcount <= bcount); + // it fits, claim the whole block + if (!mi_atomic_cas_weak(block, mi_block_info_create(bcount, true), binfo)) { + // ouch, someone else was quicker. Try again.. + continue; + } + else { + // got it, now split off the needed part + if (needed_bcount < bcount) { + mi_atomic_write(block + needed_bcount, mi_block_info_create(bcount - needed_bcount, false)); + mi_atomic_write(block, mi_block_info_create(needed_bcount, true)); + } + // return a pointer to the claimed memory + ptrdiff_t idx = (block - arena->blocks); + *is_zero = false; + *block_index = idx; + return (arena->start + (idx*MI_ARENA_BLOCK_SIZE)); + } + } + } + } + // no success + return NULL; +} + +// Try to reduce search time by starting from bottom and wrap around. 
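// For illustration only: a hypothetical single-threaded version of the scan
// that mi_arena_allocx performs above (helper name and code are a sketch, not
// part of this patch). It shows the same walk: skip in-use runs, treat a zero
// entry as the start of the untouched zero-initialized tail, and accept the
// first free run that is large enough; the real code claims, splits and
// coalesces runs with atomic compare-and-swap instead of returning an index.
static size_t mi_arena_scan_sketch(const mi_block_info_t* blocks, size_t block_count, size_t needed_bcount) {
  size_t i = 0;
  while (i < block_count) {
    mi_block_info_t info = blocks[i];
    size_t bcount = mi_block_count(info);
    if (mi_block_is_in_use(info)) {
      i += (bcount > 0 ? bcount : 1);    // in use: skip the whole run
    }
    else if (bcount == 0) {              // zero-initialized tail
      return (i + needed_bcount <= block_count ? i : SIZE_MAX);
    }
    else if (bcount >= needed_bcount) {
      return i;                          // free run that fits
    }
    else {
      i += bcount;                       // too small (the real code tries to coalesce here)
    }
  }
  return SIZE_MAX;                       // no fit found
}
// The function below performs this scan lock-free, starting at `block_bottom`
// and wrapping around to the start of the arena if nothing is found there.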
+static void* mi_arena_alloc(mi_arena_t* arena, size_t needed_bcount, bool* is_zero, size_t* block_index) +{ + uintptr_t bottom = mi_atomic_read_relaxed(&arena->block_bottom); + void* p = mi_arena_allocx(arena, bottom, arena->block_count, needed_bcount, is_zero, block_index); + if (p == NULL && bottom > 0) { + // try again from the start + p = mi_arena_allocx(arena, 0, bottom, needed_bcount, is_zero, block_index); + } + if (p != NULL) { + mi_atomic_write(&arena->block_bottom, *block_index); + } + return p; +} + +/* ----------------------------------------------------------- + Arena Allocation +----------------------------------------------------------- */ + +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) { + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + *memid = MI_MEMID_OS; + *is_zero = false; + bool default_large = false; + if (large==NULL) large = &default_large; // ensure `large != NULL` + + // try to allocate in an arena if the alignment is small enough + // and if there is not too much waste around the `MI_ARENA_BLOCK_SIZE`. + if (alignment <= MI_SEGMENT_ALIGN && + size >= 3*(MI_ARENA_BLOCK_SIZE/4) && // > 12MiB (not more than 25% waste) + !(size > MI_ARENA_BLOCK_SIZE && size < 3*(MI_ARENA_BLOCK_SIZE/2)) // ! <16MiB - 24MiB> + ) + { + size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); + size_t bcount = asize / MI_ARENA_BLOCK_SIZE; + + mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); + for (size_t i = 0; i < MI_MAX_ARENAS; i++) { + mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[i])); + if (arena==NULL) break; + if (*large || !arena->is_large) { // large OS pages allowed, or arena is not large OS pages + size_t block_index = SIZE_MAX; + void* p = mi_arena_alloc(arena, bcount, is_zero, &block_index); + if (p != NULL) { + mi_assert_internal(block_index != SIZE_MAX); + #if MI_DEBUG>=1 + _Atomic(mi_block_info_t)* block = &arena->blocks[block_index]; + mi_block_info_t binfo = mi_atomic_read(block); + mi_assert_internal(mi_block_is_in_use(binfo)); + mi_assert_internal(mi_block_count(binfo)*MI_ARENA_BLOCK_SIZE >= size); + #endif + *memid = mi_memid_create(i, block_index); + *commit = true; // TODO: support commit on demand? 
+ *large = arena->is_large; + mi_assert_internal((uintptr_t)p % alignment == 0); + return p; + } + } + } + } + + // fall back to the OS + *is_zero = true; + *memid = MI_MEMID_OS; + return _mi_os_alloc_aligned(size, alignment, *commit, large, tld); +} + +void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) +{ + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, commit, large, is_zero, memid, tld); +} + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ + +void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + if (p==NULL) return; + if (size==0) return; + if (memid == MI_MEMID_OS) { + // was a direct OS allocation, pass through + _mi_os_free(p, size, stats); + } + else { + // allocated in an arena + size_t arena_idx; + size_t block_idx; + mi_memid_indices(memid, &arena_idx, &block_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[arena_idx])); + mi_assert_internal(arena != NULL); + if (arena == NULL) { + _mi_fatal_error("trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + mi_assert_internal(arena->block_count > block_idx); + if (arena->block_count <= block_idx) { + _mi_fatal_error("trying to free from non-existent block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + _Atomic(mi_block_info_t)* block = &arena->blocks[block_idx]; + mi_block_info_t binfo = mi_atomic_read_relaxed(block); + mi_assert_internal(mi_block_is_in_use(binfo)); + mi_assert_internal(mi_block_count(binfo)*MI_ARENA_BLOCK_SIZE >= size); + if (!mi_block_is_in_use(binfo)) { + _mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size); + return; + }; + bool ok = mi_atomic_cas_strong(block, mi_block_info_create(mi_block_count(binfo), false), binfo); + mi_assert_internal(ok); + if (!ok) { + _mi_warning_message("unable to free arena block: %p, info 0x%zx", p, binfo); + } + if (block_idx < mi_atomic_read_relaxed(&arena->block_bottom)) { + mi_atomic_write(&arena->block_bottom, block_idx); + } + } +} + +/* ----------------------------------------------------------- + Add an arena. +----------------------------------------------------------- */ + +static bool mi_arena_add(mi_arena_t* arena) { + mi_assert_internal(arena != NULL); + mi_assert_internal((uintptr_t)arena->start % MI_SEGMENT_ALIGN == 0); + mi_assert_internal(arena->block_count > 0); + mi_assert_internal(mi_mem_is_zero(arena->blocks,arena->block_count*sizeof(mi_block_info_t))); + + uintptr_t i = mi_atomic_addu(&mi_arena_count,1); + if (i >= MI_MAX_ARENAS) { + mi_atomic_subu(&mi_arena_count, 1); + return false; + } + mi_atomic_write_ptr(mi_atomic_cast(void*,&mi_arenas[i]), arena); + return true; +} + + +/* ----------------------------------------------------------- + Reserve a huge page arena. + TODO: improve OS api to just reserve and claim a huge + page area at once, (and return the total size). 
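  For example (an illustrative sketch of intended use, not part of this patch),
  a program can reserve the huge pages once at startup:

    size_t reserved = 0;
    if (mi_reserve_huge_os_pages(4, 2.0, &reserved) == 0) {
      // `reserved` 1GiB pages now back this arena and are handed out for
      // large allocations before falling back to the OS
    }

  where 4 is the number of 1GiB pages requested and 2.0 the maximum number of
  seconds to spend reserving them (the process loader uses 0.5s per page).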
+----------------------------------------------------------- */ + +#include + +int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + size_t pages_reserved_default = 0; + if (pages_reserved==NULL) pages_reserved = &pages_reserved_default; + int err = _mi_os_reserve_huge_os_pages(pages, max_secs, pages_reserved); + if (*pages_reserved==0) return err; + size_t hsize = (*pages_reserved) * GiB; + void* p = _mi_os_try_alloc_from_huge_reserved(hsize, MI_SEGMENT_ALIGN); + mi_assert_internal(p != NULL); + if (p == NULL) return ENOMEM; + size_t bcount = hsize / MI_ARENA_BLOCK_SIZE; + size_t asize = sizeof(mi_arena_t) + (bcount*sizeof(mi_block_info_t)); // one too much + mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_heap_default->tld->stats); + if (arena == NULL) return ENOMEM; + arena->block_count = bcount; + arena->start = (uint8_t*)p; + arena->block_bottom = 0; + arena->is_large = true; + arena->is_zero_init = true; + memset(arena->blocks, 0, bcount * sizeof(mi_block_info_t)); + //mi_atomic_write(&arena->blocks[0], mi_block_info_create(bcount, false)); + mi_arena_add(arena); + return 0; +} diff --git a/src/memory.c b/src/memory.c index dd03cf95..9ab7c850 100644 --- a/src/memory.c +++ b/src/memory.c @@ -50,6 +50,12 @@ void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* sta void* _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment); bool _mi_os_is_huge_reserved(void* p); +// arena.c +void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats); +void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld); +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld); + + // Constants #if (MI_INTPTR_SIZE==8) #define MI_HEAP_REGION_MAX_SIZE (256 * (1ULL << 30)) // 256GiB => 16KiB for the region map @@ -87,6 +93,7 @@ typedef struct mem_region_s { volatile _Atomic(uintptr_t) map; // in-use bit per MI_SEGMENT_SIZE block volatile _Atomic(mi_region_info_t) info; // start of virtual memory area, and flags volatile _Atomic(uintptr_t) dirty_mask; // bit per block if the contents are not zero'd + size_t arena_memid; } mem_region_t; @@ -131,6 +138,30 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { } +static size_t mi_memid_create(size_t idx, size_t bitidx) { + return ((idx*MI_REGION_MAP_BITS) + bitidx)<<1; +} + +static size_t mi_memid_create_from_arena(size_t arena_memid) { + return (arena_memid << 1) | 1; +} + +static bool mi_memid_is_arena(size_t id) { + return ((id&1)==1); +} + +static bool mi_memid_indices(size_t id, size_t* idx, size_t* bitidx, size_t* arena_memid) { + if (mi_memid_is_arena(id)) { + *arena_memid = (id>>1); + return true; + } + else { + *idx = ((id>>1) / MI_REGION_MAP_BITS); + *bitidx = ((id>>1) % MI_REGION_MAP_BITS); + return false; + } +} + /* ---------------------------------------------------------------------------- Commit from a region -----------------------------------------------------------------------------*/ @@ -153,6 +184,9 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit { bool region_commit = mi_option_is_enabled(mi_option_eager_region_commit); bool region_large = *allow_large; + size_t arena_memid = 0; + void* start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, is_zero, &arena_memid, tld); + /* void* start = NULL; if (region_large) { start = 
_mi_os_try_alloc_from_huge_reserved(MI_REGION_SIZE, MI_SEGMENT_ALIGN); @@ -161,6 +195,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit if (start == NULL) { start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, region_commit, ®ion_large, tld); } + */ mi_assert_internal(!(region_large && !*allow_large)); if (start == NULL) { @@ -176,6 +211,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit info = mi_region_info_create(start,region_large,region_commit); if (mi_atomic_cas_strong(®ion->info, info, 0)) { // update the region count + region->arena_memid = arena_memid; mi_atomic_increment(®ions_count); } else { @@ -183,6 +219,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // we assign it to a later slot instead (up to 4 tries). for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { + regions[idx+i].arena_memid = arena_memid; mi_atomic_increment(®ions_count); start = NULL; break; @@ -190,7 +227,8 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit } if (start != NULL) { // free it if we didn't succeed to save it to some other region - _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats); + _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); + // _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats); } // and continue with the memory at our index info = mi_atomic_read(®ion->info); @@ -229,7 +267,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit mi_assert_internal(blocks_start != NULL); *allow_large = region_is_large; *p = blocks_start; - *id = (idx*MI_REGION_MAP_BITS) + bitidx; + *id = mi_memid_create(idx, bitidx); return true; } @@ -269,7 +307,7 @@ static inline size_t mi_bsr(uintptr_t x) { // Allocate `blocks` in a `region` at `idx` of a given `size`. // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. +// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. // (not being able to claim is not considered an error so check for `p != NULL` afterwards). 
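// Illustrative calling convention for the function below (a sketch, not part
// of this patch): callers pre-initialize the out-parameters and distinguish a
// hard failure from "this region is simply full":
//
//   void* p = NULL; size_t id = 0;     // only written on a successful claim
//   if (!mi_region_alloc_blocks(region, idx, blocks, size,
//                               &commit, &allow_large, &is_zero,
//                               &p, &id, tld)) {
//     return NULL;                     // hard failure (OOM)
//   }
//   if (p == NULL) { /* no free range in this region -- try the next one */ }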
static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld) @@ -366,15 +404,17 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l { mi_assert_internal(id != NULL && tld != NULL); mi_assert_internal(size > 0); - *id = SIZE_MAX; + *id = 0; *is_zero = false; bool default_large = false; if (large==NULL) large = &default_large; // ensure `large != NULL` - // use direct OS allocation for huge blocks or alignment (with `id = SIZE_MAX`) + // use direct OS allocation for huge blocks or alignment if (size > MI_REGION_MAX_ALLOC_SIZE || alignment > MI_SEGMENT_ALIGN) { - *is_zero = true; - return _mi_os_alloc_aligned(mi_good_commit_size(size), alignment, *commit, large, tld); // round up size + size_t arena_memid = 0; + void* p = _mi_arena_alloc_aligned(mi_good_commit_size(size), alignment, commit, large, is_zero, &arena_memid, tld); // round up size + *id = mi_memid_create_from_arena(arena_memid); + return p; } // always round size to OS page size multiple (so commit/decommit go over the entire range) @@ -405,9 +445,10 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l if (p == NULL) { // we could not find a place to allocate, fall back to the os directly - _mi_warning_message("unable to allocate from region: size %zu\n", size); - *is_zero = true; - p = _mi_os_alloc_aligned(size, alignment, commit, large, tld); + _mi_warning_message("unable to allocate from region: size %zu\n", size); + size_t arena_memid = 0; + p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_zero, &arena_memid, tld); + *id = mi_memid_create_from_arena(arena_memid); } else { tld->region_idx = idx; // next start of search? currently not used as we use first-fit @@ -428,18 +469,19 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { mi_assert_internal(size > 0 && stats != NULL); if (p==NULL) return; if (size==0) return; - if (id == SIZE_MAX) { - // was a direct OS allocation, pass through - _mi_os_free(p, size, stats); + size_t arena_memid = 0; + size_t idx = 0; + size_t bitidx = 0; + if (mi_memid_indices(id,&idx,&bitidx,&arena_memid)) { + // was a direct arena allocation, pass through + _mi_arena_free(p, size, arena_memid, stats); } else { // allocated in a region mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); if (size > MI_REGION_MAX_ALLOC_SIZE) return; // we can align the size up to page size (as we allocate that way too) // this ensures we fully commit/decommit/reset - size = _mi_align_up(size, _mi_os_page_size()); - size_t idx = (id / MI_REGION_MAP_BITS); - size_t bitidx = (id % MI_REGION_MAP_BITS); + size = _mi_align_up(size, _mi_os_page_size()); size_t blocks = mi_region_block_count(size); size_t mask = mi_region_block_mask(blocks, bitidx); mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`? 
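// Hypothetical standalone check (not part of this patch) of the id scheme used
// in memory.c above: region ids are `((idx*MI_REGION_MAP_BITS)+bitidx) << 1`
// (always even) and arena ids are `(arena_memid << 1) | 1` (always odd), so
// _mi_mem_free can pick the right decoder from the low bit. It assumes
// MI_REGION_MAP_BITS is the number of bits in a uintptr_t (64 on 64-bit).
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
static void mi_memid_roundtrip_check(void) {
  const size_t map_bits = 8*sizeof(uintptr_t);        // stands in for MI_REGION_MAP_BITS
  size_t idx = 2, bitidx = 5, arena_memid = 0x301;
  size_t region_id = ((idx*map_bits) + bitidx) << 1;  // 266: even => region id
  size_t arena_id  = (arena_memid << 1) | 1;          // 0x603: odd => arena id
  assert((region_id & 1) == 0 && (arena_id & 1) == 1);
  assert((region_id >> 1) / map_bits == idx && (region_id >> 1) % map_bits == bitidx);
  assert((arena_id >> 1) == arena_memid);
}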
@@ -503,11 +545,11 @@ void _mi_mem_collect(mi_stats_t* stats) { m = mi_atomic_read_relaxed(®ion->map); } while(m == 0 && !mi_atomic_cas_weak(®ion->map, ~((uintptr_t)0), 0 )); if (m == 0) { - // on success, free the whole region (unless it was huge reserved) + // on success, free the whole region bool is_eager_committed; void* start = mi_region_info_read(mi_atomic_read(®ion->info), NULL, &is_eager_committed); - if (start != NULL && !_mi_os_is_huge_reserved(start)) { - _mi_os_free_ex(start, MI_REGION_SIZE, is_eager_committed, stats); + if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { + _mi_arena_free(start, MI_REGION_SIZE, region->arena_memid, stats); } // and release mi_atomic_write(®ion->info,0); diff --git a/src/os.c b/src/os.c index 8f5afc5b..85cd1a83 100644 --- a/src/os.c +++ b/src/os.c @@ -869,13 +869,13 @@ static void mi_os_free_huge_reserved() { */ #if !(MI_INTPTR_SIZE >= 8 && (defined(_WIN32) || defined(MI_OS_USE_MMAP))) -int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { +int _mi_os_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { UNUSED(pages); UNUSED(max_secs); if (pages_reserved != NULL) *pages_reserved = 0; return ENOMEM; } #else -int mi_reserve_huge_os_pages( size_t pages, double max_secs, size_t* pages_reserved ) mi_attr_noexcept +int _mi_os_reserve_huge_os_pages( size_t pages, double max_secs, size_t* pages_reserved ) mi_attr_noexcept { if (pages_reserved != NULL) *pages_reserved = 0; if (max_secs==0) return ETIMEDOUT; // timeout From aaf01620f4e878d48a4d2815bd0d894f28a5f093 Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 31 Oct 2019 19:39:49 -0700 Subject: [PATCH 02/48] improve allocation of the huge OS page arena --- src/arena.c | 39 ++++++++++++------- src/os.c | 110 +++++++++++++++++----------------------------------- 2 files changed, 60 insertions(+), 89 deletions(-) diff --git a/src/arena.c b/src/arena.c index 5f33965a..469755f2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -6,7 +6,16 @@ terms of the MIT license. A copy of the license can be found in the file -----------------------------------------------------------------------------*/ /* ---------------------------------------------------------------------------- +"Arenas" are fixed area's of OS memory from which we can allocate +large blocks (>= MI_ARENA_BLOCK_SIZE, 16MiB). Currently only used to +allocate in one arena consisting of huge OS pages -- otherwise it +delegates to direct allocation from the OS. +In the future, we can expose an API to manually add more arenas which +is sometimes needed for embedded devices or shared memory for example. + +The arena allocation needs to be thread safe and we use a lock-free scan +with on-demand coalescing. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc-internal.h" @@ -16,8 +25,8 @@ terms of the MIT license. 
A copy of the license can be found in the file // os.c void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); -void* _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment); -int _mi_os_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; +int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept; +void _mi_os_free(void* p, size_t size, mi_stats_t* stats); /* ----------------------------------------------------------- Arena allocation @@ -338,25 +347,27 @@ static bool mi_arena_add(mi_arena_t* arena) { /* ----------------------------------------------------------- Reserve a huge page arena. - TODO: improve OS api to just reserve and claim a huge - page area at once, (and return the total size). ----------------------------------------------------------- */ - -#include +#include // ENOMEM int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { size_t pages_reserved_default = 0; if (pages_reserved==NULL) pages_reserved = &pages_reserved_default; - int err = _mi_os_reserve_huge_os_pages(pages, max_secs, pages_reserved); - if (*pages_reserved==0) return err; - size_t hsize = (*pages_reserved) * GiB; - void* p = _mi_os_try_alloc_from_huge_reserved(hsize, MI_SEGMENT_ALIGN); - mi_assert_internal(p != NULL); - if (p == NULL) return ENOMEM; + size_t hsize = 0; + void* p = NULL; + int err = _mi_os_alloc_huge_os_pages(pages, max_secs, &p, pages_reserved, &hsize); + _mi_verbose_message("reserved %zu huge pages\n", *pages_reserved); + if (p==NULL) return err; + // err might be != 0 but that is fine, we just got less pages. + mi_assert_internal(*pages_reserved > 0 && hsize > 0 && *pages_reserved <= pages); size_t bcount = hsize / MI_ARENA_BLOCK_SIZE; size_t asize = sizeof(mi_arena_t) + (bcount*sizeof(mi_block_info_t)); // one too much - mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_heap_default->tld->stats); - if (arena == NULL) return ENOMEM; + mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); + if (arena == NULL) { + *pages_reserved = 0; + _mi_os_free(p, hsize, &_mi_stats_main); + return ENOMEM; + } arena->block_count = bcount; arena->start = (uint8_t*)p; arena->block_bottom = 0; diff --git a/src/os.c b/src/os.c index 85cd1a83..b7bffa64 100644 --- a/src/os.c +++ b/src/os.c @@ -36,8 +36,6 @@ terms of the MIT license. A copy of the license can be found in the file large OS pages (if MIMALLOC_LARGE_OS_PAGES is true). 
----------------------------------------------------------- */ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_is_huge_reserved(void* p); -void* _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment); static void* mi_align_up_ptr(void* p, size_t alignment) { return (void*)_mi_align_up((uintptr_t)p, alignment); @@ -184,7 +182,7 @@ void _mi_os_init() { static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats) { - if (addr == NULL || size == 0 || _mi_os_is_huge_reserved(addr)) return true; + if (addr == NULL || size == 0) return true; // || _mi_os_is_huge_reserved(addr) bool err = false; #if defined(_WIN32) err = (VirtualFree(addr, 0, MEM_RELEASE) == 0); @@ -628,7 +626,7 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ *is_zero = false; size_t csize; void* start = mi_os_page_align_areax(conservative, addr, size, &csize); - if (csize == 0 || _mi_os_is_huge_reserved(addr)) return true; + if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr)) int err = 0; if (commit) { _mi_stat_increase(&stats->committed, csize); @@ -684,7 +682,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) // page align conservatively within the range size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); - if (csize == 0 || _mi_os_is_huge_reserved(addr)) return true; + if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) if (reset) _mi_stat_increase(&stats->reset, csize); else _mi_stat_decrease(&stats->reset, csize); if (!reset) return true; // nothing to do on unreset! @@ -758,9 +756,11 @@ static bool mi_os_protectx(void* addr, size_t size, bool protect) { size_t csize = 0; void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return false; + /* if (_mi_os_is_huge_reserved(addr)) { _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n"); } + */ int err = 0; #ifdef _WIN32 DWORD oldprotect = 0; @@ -816,79 +816,42 @@ will be reused. -----------------------------------------------------------------------------*/ #define MI_HUGE_OS_PAGE_SIZE ((size_t)1 << 30) // 1GiB -typedef struct mi_huge_info_s { - volatile _Atomic(void*) start; // start of huge page area (32TiB) - volatile _Atomic(size_t) reserved; // total reserved size - volatile _Atomic(size_t) used; // currently allocated -} mi_huge_info_t; - -static mi_huge_info_t os_huge_reserved = { NULL, 0, ATOMIC_VAR_INIT(0) }; - -bool _mi_os_is_huge_reserved(void* p) { - return (mi_atomic_read_ptr(&os_huge_reserved.start) != NULL && - p >= mi_atomic_read_ptr(&os_huge_reserved.start) && - (uint8_t*)p < (uint8_t*)mi_atomic_read_ptr(&os_huge_reserved.start) + mi_atomic_read(&os_huge_reserved.reserved)); -} - -void* _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment) -{ - // only allow large aligned allocations (e.g. 
regions) - if (size < MI_SEGMENT_SIZE || (size % MI_SEGMENT_SIZE) != 0) return NULL; - if (try_alignment > MI_SEGMENT_SIZE) return NULL; - if (mi_atomic_read_ptr(&os_huge_reserved.start)==NULL) return NULL; - if (mi_atomic_read(&os_huge_reserved.used) >= mi_atomic_read(&os_huge_reserved.reserved)) return NULL; // already full - - // always aligned - mi_assert_internal(mi_atomic_read(&os_huge_reserved.used) % MI_SEGMENT_SIZE == 0 ); - mi_assert_internal( (uintptr_t)mi_atomic_read_ptr(&os_huge_reserved.start) % MI_SEGMENT_SIZE == 0 ); - - // try to reserve space - size_t base = mi_atomic_addu( &os_huge_reserved.used, size ); - if ((base + size) > os_huge_reserved.reserved) { - // "free" our over-allocation - mi_atomic_subu( &os_huge_reserved.used, size); - return NULL; - } - - // success! - uint8_t* p = (uint8_t*)mi_atomic_read_ptr(&os_huge_reserved.start) + base; - mi_assert_internal( (uintptr_t)p % MI_SEGMENT_SIZE == 0 ); - return p; -} - -/* -static void mi_os_free_huge_reserved() { - uint8_t* addr = os_huge_reserved.start; - size_t total = os_huge_reserved.reserved; - os_huge_reserved.reserved = 0; - os_huge_reserved.start = NULL; - for( size_t current = 0; current < total; current += MI_HUGE_OS_PAGE_SIZE) { - _mi_os_free(addr + current, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main); - } -} -*/ #if !(MI_INTPTR_SIZE >= 8 && (defined(_WIN32) || defined(MI_OS_USE_MMAP))) -int _mi_os_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { +int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** start, size_t* pages_reserved, size_t* size) mi_attr_noexcept { UNUSED(pages); UNUSED(max_secs); + if (start != NULL) *start = NULL; if (pages_reserved != NULL) *pages_reserved = 0; + if (size != NULL) *size = 0; return ENOMEM; } #else -int _mi_os_reserve_huge_os_pages( size_t pages, double max_secs, size_t* pages_reserved ) mi_attr_noexcept +static _Atomic(uintptr_t) huge_top; // = 0 + +int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept { - if (pages_reserved != NULL) *pages_reserved = 0; + *pstart = NULL; + *pages_reserved = 0; + *psize = 0; if (max_secs==0) return ETIMEDOUT; // timeout if (pages==0) return 0; // ok - if (!mi_atomic_cas_ptr_strong(&os_huge_reserved.start,(void*)1,NULL)) return ETIMEDOUT; // already reserved - // Set the start address after the 32TiB area - uint8_t* start = (uint8_t*)((uintptr_t)32 << 40); // 32TiB virtual start address - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode - uintptr_t r = _mi_random_init((uintptr_t)&mi_reserve_huge_os_pages); - start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB - #endif + // Atomically claim a huge address range + size_t size = pages * MI_HUGE_OS_PAGE_SIZE; + uint8_t* start; + do { + start = (uint8_t*)mi_atomic_addu(&huge_top, size); + if (start == NULL) { + uintptr_t top = ((uintptr_t)32 << 40); // 32TiB virtual start address + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode + uintptr_t r = _mi_random_init((uintptr_t)&_mi_os_alloc_huge_os_pages); + top += ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB + #endif + mi_atomic_cas_strong(&huge_top, top, 0); + } + } while (start == NULL); + // Allocate one page at the time but try to place them contiguously // We allocate one page at the time to be able to abort if 
it takes too long double start_t = _mi_clock_start(); @@ -925,16 +888,13 @@ int _mi_os_reserve_huge_os_pages( size_t pages, double max_secs, size_t* pages_r } // success, record it if (page==0) { - mi_atomic_write_ptr(&os_huge_reserved.start, addr); // don't switch the order of these writes - mi_atomic_write(&os_huge_reserved.reserved, MI_HUGE_OS_PAGE_SIZE); + *pstart = addr; } - else { - mi_atomic_addu(&os_huge_reserved.reserved,MI_HUGE_OS_PAGE_SIZE); - } - _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); + *psize += MI_HUGE_OS_PAGE_SIZE; + *pages_reserved += 1; + _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); - if (pages_reserved != NULL) { *pages_reserved = page + 1; } - + // check for timeout double elapsed = _mi_clock_end(start_t); if (elapsed > max_secs) return ETIMEDOUT; @@ -943,7 +903,7 @@ int _mi_os_reserve_huge_os_pages( size_t pages, double max_secs, size_t* pages_r if (estimate > 1.5*max_secs) return ETIMEDOUT; // seems like we are going to timeout } } - _mi_verbose_message("reserved %zu huge pages\n", pages); + mi_assert_internal(*psize == size); return 0; } #endif From a6499be074a52232ed131eeabb3bd8040f2743c3 Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 1 Nov 2019 19:53:07 -0700 Subject: [PATCH 03/48] initial numa support for arenas --- include/mimalloc-internal.h | 1 + include/mimalloc.h | 8 +- src/arena.c | 128 +++++++++++++----- src/init.c | 2 +- src/options.c | 3 +- src/os.c | 252 +++++++++++++++++++----------------- 6 files changed, 241 insertions(+), 153 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 2b881ac9..dd677a02 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -56,6 +56,7 @@ void _mi_os_init(void); // called fro void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocate thread local data void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data size_t _mi_os_good_alloc_size(size_t size); +int _mi_os_numa_node(void); // memory.c diff --git a/include/mimalloc.h b/include/mimalloc.h index b63ed79d..b155aca6 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -228,9 +228,14 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b // Experimental mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; -mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; mi_decl_export bool mi_is_redirected() mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept; + +// deprecated +mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; + // ------------------------------------------------------ // Convenience // ------------------------------------------------------ @@ -271,6 +276,7 @@ typedef enum mi_option_e { mi_option_eager_commit_delay, mi_option_segment_reset, mi_option_os_tag, + mi_option_max_numa_node, _mi_option_last } mi_option_t; diff --git a/src/arena.c b/src/arena.c index 469755f2..5bc3900c 100644 --- a/src/arena.c +++ b/src/arena.c @@ -25,8 +25,10 @@ with on-demand coalescing. 
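For illustration, each block run is described by a `mi_block_info_t` that
packs the `in_use` bit in bit 0 and the length of the run (in arena blocks)
in the upper bits. Claiming a sufficiently large free run then comes down to
a single compare-and-swap on its first entry (a sketch using the helpers
defined further below; the real scan also splits and coalesces runs):

  mi_block_info_t binfo = mi_atomic_read_relaxed(block);  // e.g. a free run of 8 blocks
  if (!mi_block_is_in_use(binfo) && mi_block_count(binfo) >= needed_bcount) {
    if (mi_atomic_cas_weak(block, mi_block_info_create(needed_bcount, true), binfo)) {
      // claimed; any remainder of the run is written back as a smaller free run
    }
    // on failure another thread raced us and the scan simply retries
  }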
// os.c void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); -int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept; +//int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept; void _mi_os_free(void* p, size_t size, mi_stats_t* stats); +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize); +int _mi_os_numa_node_count(void); /* ----------------------------------------------------------- Arena allocation @@ -44,6 +46,7 @@ typedef uintptr_t mi_block_info_t; typedef struct mi_arena_s { uint8_t* start; // the start of the memory area size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + int numa_node; // associated NUMA node bool is_zero_init; // is the arena zero initialized? bool is_large; // large OS page allocated _Atomic(uintptr_t) block_bottom; // optimization to start the search for free blocks @@ -223,7 +226,31 @@ static void* mi_arena_alloc(mi_arena_t* arena, size_t needed_bcount, bool* is_ze Arena Allocation ----------------------------------------------------------- */ -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) { +static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, + bool* commit, bool* large, bool* is_zero, + size_t* memid) +{ + size_t block_index = SIZE_MAX; + void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &block_index); + if (p != NULL) { + mi_assert_internal(block_index != SIZE_MAX); +#if MI_DEBUG>=1 + _Atomic(mi_block_info_t)* block = &arena->blocks[block_index]; + mi_block_info_t binfo = mi_atomic_read(block); + mi_assert_internal(mi_block_is_in_use(binfo)); + mi_assert_internal(mi_block_count(binfo) >= needed_bcount); +#endif + *memid = mi_memid_create(arena_index, block_index); + *commit = true; // TODO: support commit on demand? + *large = arena->is_large; + } + return p; +} + +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, + bool* commit, bool* large, bool* is_zero, + size_t* memid, mi_os_tld_t* tld) +{ mi_assert_internal(memid != NULL && tld != NULL); mi_assert_internal(size > 0); *memid = MI_MEMID_OS; @@ -240,33 +267,36 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* { size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); size_t bcount = asize / MI_ARENA_BLOCK_SIZE; + int numa_node = _mi_os_numa_node(); // current numa node mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); + // try numa affine allocation for (size_t i = 0; i < MI_MAX_ARENAS; i++) { mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[i])); - if (arena==NULL) break; - if (*large || !arena->is_large) { // large OS pages allowed, or arena is not large OS pages - size_t block_index = SIZE_MAX; - void* p = mi_arena_alloc(arena, bcount, is_zero, &block_index); - if (p != NULL) { - mi_assert_internal(block_index != SIZE_MAX); - #if MI_DEBUG>=1 - _Atomic(mi_block_info_t)* block = &arena->blocks[block_index]; - mi_block_info_t binfo = mi_atomic_read(block); - mi_assert_internal(mi_block_is_in_use(binfo)); - mi_assert_internal(mi_block_count(binfo)*MI_ARENA_BLOCK_SIZE >= size); - #endif - *memid = mi_memid_create(i, block_index); - *commit = true; // TODO: support commit on demand? 
- *large = arena->is_large; - mi_assert_internal((uintptr_t)p % alignment == 0); - return p; - } + if (arena==NULL) break; // end reached + if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local? + (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages + { + void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid); + mi_assert_internal((uintptr_t)p % alignment == 0); + if (p != NULL) return p; + } + } + // try from another numa node instead.. + for (size_t i = 0; i < MI_MAX_ARENAS; i++) { + mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[i])); + if (arena==NULL) break; // end reached + if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local! + (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages + { + void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid); + mi_assert_internal((uintptr_t)p % alignment == 0); + if (p != NULL) return p; } } } - // fall back to the OS + // finally, fall back to the OS *is_zero = true; *memid = MI_MEMID_OS; return _mi_os_alloc_aligned(size, alignment, *commit, large, tld); @@ -350,31 +380,61 @@ static bool mi_arena_add(mi_arena_t* arena) { ----------------------------------------------------------- */ #include // ENOMEM -int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { - size_t pages_reserved_default = 0; - if (pages_reserved==NULL) pages_reserved = &pages_reserved_default; +// reserve at a specific numa node +static int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept { size_t hsize = 0; - void* p = NULL; - int err = _mi_os_alloc_huge_os_pages(pages, max_secs, &p, pages_reserved, &hsize); - _mi_verbose_message("reserved %zu huge pages\n", *pages_reserved); - if (p==NULL) return err; - // err might be != 0 but that is fine, we just got less pages. - mi_assert_internal(*pages_reserved > 0 && hsize > 0 && *pages_reserved <= pages); + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, &hsize); + if (p==NULL) return ENOMEM; + _mi_verbose_message("reserved %zu huge (1GiB) pages\n", pages); + size_t bcount = hsize / MI_ARENA_BLOCK_SIZE; - size_t asize = sizeof(mi_arena_t) + (bcount*sizeof(mi_block_info_t)); // one too much - mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); + size_t asize = sizeof(mi_arena_t) + (bcount*sizeof(mi_block_info_t)); // one too much + mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS? if (arena == NULL) { - *pages_reserved = 0; _mi_os_free(p, hsize, &_mi_stats_main); return ENOMEM; } arena->block_count = bcount; arena->start = (uint8_t*)p; arena->block_bottom = 0; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = true; arena->is_zero_init = true; memset(arena->blocks, 0, bcount * sizeof(mi_block_info_t)); - //mi_atomic_write(&arena->blocks[0], mi_block_info_create(bcount, false)); mi_arena_add(arena); return 0; } + + +// reserve huge pages evenly among all numa nodes. 
+int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept { + if (pages == 0) return 0; + + // pages per numa node + int numa_count = _mi_os_numa_node_count(); + if (numa_count <= 0) numa_count = 1; + size_t pages_per = pages / numa_count; + if (pages_per == 0) pages_per = 1; + + // reserve evenly among numa nodes + for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + int err = mi_reserve_huge_os_pages_at((pages_per > pages ? pages : pages_per), numa_node); + if (err) return err; + if (pages < pages_per) { + pages = 0; + } + else { + pages -= pages_per; + } + } + + return 0; +} + +int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + _mi_verbose_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); + if (pages_reserved != NULL) *pages_reserved = 0; + int err = mi_reserve_huge_os_pages_interleave(pages); + if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; + return err; +} diff --git a/src/init.c b/src/init.c index e15d82eb..138b54aa 100644 --- a/src/init.c +++ b/src/init.c @@ -435,7 +435,7 @@ static void mi_process_load(void) { if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); double max_secs = (double)pages / 2.0; // 0.5s per page (1GiB) - mi_reserve_huge_os_pages(pages, max_secs, NULL); + mi_reserve_huge_os_pages_interleave(pages); } } diff --git a/src/options.c b/src/options.c index a49c46ed..32f13d54 100644 --- a/src/options.c +++ b/src/options.c @@ -66,7 +66,8 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(reset_decommits) }, // note: cannot enable this if secure is on { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) - { 100, UNINIT, MI_OPTION(os_tag) } // only apple specific for now but might serve more or less related purpose + { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose + { 256, UNINIT, MI_OPTION(max_numa_node) } // maximum allowed numa node }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/os.c b/src/os.c index b7bffa64..c0564174 100644 --- a/src/os.c +++ b/src/os.c @@ -170,7 +170,7 @@ void _mi_os_init() { os_alloc_granularity = os_page_size; } if (mi_option_is_enabled(mi_option_large_os_pages)) { - large_os_page_size = (1UL << 21); // 2MiB + large_os_page_size = 2*MiB; } } #endif @@ -207,31 +207,6 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size); #ifdef _WIN32 static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) { -#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) - // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages - if ((size % ((uintptr_t)1 << 30)) == 0 /* 1GiB multiple */ - && (flags & MEM_LARGE_PAGES) != 0 && (flags & MEM_COMMIT) != 0 && (flags & MEM_RESERVE) != 0 - && (addr != NULL || try_alignment == 0 || try_alignment % _mi_os_page_size() == 0) - && pNtAllocateVirtualMemoryEx != NULL) - { - #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE - #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) - #endif - MEM_EXTENDED_PARAMETER param = { 0, 0 }; - param.Type = 5; // == MemExtendedParameterAttributeFlags; - param.ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; - SIZE_T psize = size; - void* base = addr; - 
NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, ¶m, 1); - if (err == 0) { - return base; - } - else { - // else fall back to regular large OS pages - _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error 0x%lx)\n", err); - } - } -#endif #if (MI_INTPTR_SIZE >= 8) // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations void* hint; @@ -364,7 +339,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro lflags |= MAP_HUGETLB; #endif #ifdef MAP_HUGE_1GB - if ((size % ((uintptr_t)1 << 30)) == 0) { + if ((size % GiB) == 0) { lflags |= MAP_HUGE_1GB; } else @@ -400,10 +375,10 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); #if defined(MADV_HUGEPAGE) // Many Linux systems don't allow MAP_HUGETLB but they support instead - // transparent huge pages (TPH). It is not required to call `madvise` with MADV_HUGE + // transparent huge pages (THP). It is not required to call `madvise` with MADV_HUGE // though since properly aligned allocations will already use large pages if available // in that case -- in particular for our large regions (in `memory.c`). - // However, some systems only allow TPH if called with explicit `madvise`, so + // However, some systems only allow THP if called with explicit `madvise`, so // when large OS pages are enabled for mimalloc, we call `madvice` anyways. if (allow_large && use_large_os_page(size, try_alignment)) { if (madvise(p, size, MADV_HUGEPAGE) == 0) { @@ -810,101 +785,146 @@ bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) { /* ---------------------------------------------------------------------------- -Support for huge OS pages (1Gib) that are reserved up-front and never -released. Only regions are allocated in here (see `memory.c`) so the memory -will be reused. +Support for allocating huge OS pages (1Gib) that are reserved up-front +and possibly associated with a specific NUMA node. 
(use `numa_node>=0`) -----------------------------------------------------------------------------*/ -#define MI_HUGE_OS_PAGE_SIZE ((size_t)1 << 30) // 1GiB +#define MI_HUGE_OS_PAGE_SIZE (GiB) +#if defined(WIN32) && (MI_INTPTR_SIZE >= 8) +static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) +{ + mi_assert_internal(size%GiB == 0); -#if !(MI_INTPTR_SIZE >= 8 && (defined(_WIN32) || defined(MI_OS_USE_MMAP))) -int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** start, size_t* pages_reserved, size_t* size) mi_attr_noexcept { - UNUSED(pages); UNUSED(max_secs); - if (start != NULL) *start = NULL; - if (pages_reserved != NULL) *pages_reserved = 0; - if (size != NULL) *size = 0; - return ENOMEM; -} -#else -static _Atomic(uintptr_t) huge_top; // = 0 - -int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept -{ - *pstart = NULL; - *pages_reserved = 0; - *psize = 0; - if (max_secs==0) return ETIMEDOUT; // timeout - if (pages==0) return 0; // ok - - // Atomically claim a huge address range - size_t size = pages * MI_HUGE_OS_PAGE_SIZE; - uint8_t* start; - do { - start = (uint8_t*)mi_atomic_addu(&huge_top, size); - if (start == NULL) { - uintptr_t top = ((uintptr_t)32 << 40); // 32TiB virtual start address - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode - uintptr_t r = _mi_random_init((uintptr_t)&_mi_os_alloc_huge_os_pages); - top += ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB - #endif - mi_atomic_cas_strong(&huge_top, top, 0); - } - } while (start == NULL); - + #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) + DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; + MEM_EXTENDED_PARAMETER params[4] = { {0,0},{0,0},{0,0},{0,0} }; + MEM_ADDRESS_REQUIREMENTS reqs = {0,0,0}; + reqs.HighestEndingAddress = NULL; + reqs.LowestStartingAddress = NULL; + reqs.Alignment = MI_SEGMENT_SIZE; - // Allocate one page at the time but try to place them contiguously - // We allocate one page at the time to be able to abort if it takes too long - double start_t = _mi_clock_start(); - uint8_t* addr = start; // current top of the allocations - for (size_t page = 0; page < pages; page++, addr += MI_HUGE_OS_PAGE_SIZE ) { - // allocate a page - void* p = NULL; - bool is_large = true; - #ifdef _WIN32 - if (page==0) { mi_win_enable_large_os_pages(); } - p = mi_win_virtual_alloc(addr, MI_HUGE_OS_PAGE_SIZE, 0, MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE, true, true, &is_large); - #elif defined(MI_OS_USE_MMAP) - p = mi_unix_mmap(addr, MI_HUGE_OS_PAGE_SIZE, 0, PROT_READ | PROT_WRITE, true, true, &is_large); - #else - // always fail - #endif - - // Did we succeed at a contiguous address? 
- if (p != addr) { - // no success, issue a warning and return with an error - if (p != NULL) { - _mi_warning_message("could not allocate contiguous huge page %zu at 0x%p\n", page, addr); - _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main ); - } - else { - #ifdef _WIN32 - int err = GetLastError(); - #else - int err = errno; - #endif - _mi_warning_message("could not allocate huge page %zu at 0x%p, error: %i\n", page, addr, err); - } - return ENOMEM; + // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages + if (pNtAllocateVirtualMemoryEx != NULL) { + #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE + #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) + #endif + params[0].Type = MemExtendedParameterAddressRequirements; + params[0].Pointer = &reqs; + params[1].Type = 5; // == MemExtendedParameterAttributeFlags; + params[1].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; + size_t param_count = 2; + if (numa_node >= 0) { + param_count++; + params[2].Type = MemExtendedParameterNumaNode; + params[2].ULong = (unsigned)numa_node; } - // success, record it - if (page==0) { - *pstart = addr; + SIZE_T psize = size; + void* base = NULL; + NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); + if (err == 0) { + return base; } - *psize += MI_HUGE_OS_PAGE_SIZE; - *pages_reserved += 1; - _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); - _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); - - // check for timeout - double elapsed = _mi_clock_end(start_t); - if (elapsed > max_secs) return ETIMEDOUT; - if (page >= 1) { - double estimate = ((elapsed / (double)(page+1)) * (double)pages); - if (estimate > 1.5*max_secs) return ETIMEDOUT; // seems like we are going to timeout + else { + // fall back to regular huge pages + _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (error 0x%lx)\n", err); } } - mi_assert_internal(*psize == size); - return 0; + // on modern Windows try use VirtualAlloc2 for aligned large OS page allocation + if (pVirtualAlloc2 != NULL) { + params[0].Type = MemExtendedParameterAddressRequirements; + params[0].Pointer = &reqs; + size_t param_count = 1; + if (numa_node >= 0) { + param_count++; + params[1].Type = MemExtendedParameterNumaNode; + params[1].ULong = (unsigned)numa_node; + } + return (*pVirtualAlloc2)(GetCurrentProcess(), NULL, size, flags, PAGE_READWRITE, params, param_count); + } + #endif + return NULL; // give up on older Windows.. 
+} +#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) +#ifdef MI_HAS_NUMA +#include // mbind, and use -lnuma +#endif +static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { + mi_assert_internal(size%GiB == 0); + bool is_large = true; + void* p = mi_unix_mmap(NULL, MI_HUGE_OS_PAGE_SIZE, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); + if (p == NULL) return NULL; + #ifdef MI_HAS_NUMA + if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { + uintptr_t numa_mask = (1UL << numa_node); + long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); + if (err != 0) { + _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno)); + } + } + #endif + return p; +} +#else +static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { + return NULL; } #endif +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) { + if (psize != NULL) *psize = 0; + size_t size = pages * MI_HUGE_OS_PAGE_SIZE; + void* p = mi_os_alloc_huge_os_pagesx(size, numa_node); + if (p==NULL) return NULL; + if (psize != NULL) *psize = size; + _mi_stat_increase(&_mi_stats_main.committed, size); + _mi_stat_increase(&_mi_stats_main.reserved, size); + return p; +} + +#ifdef WIN32 +static int mi_os_numa_nodex(void) { + PROCESSOR_NUMBER pnum; + USHORT numa_node = 0; + GetCurrentProcessorNumberEx(&pnum); + GetNumaProcessorNodeEx(&pnum,&numa_node); + return (int)numa_node; +} + +static int mi_os_numa_node_countx(void) { + ULONG numa_max = 0; + GetNumaHighestNodeNumber(&numa_max); + return (int)(numa_max + 1); +} +#elif MI_HAS_NUMA +#include +static int mi_os_numa_nodex(void) { + return numa_preferred(); +} +static int mi_os_numa_node_countx(void) { + return (numa_max_node() + 1); +} +#else +static int mi_os_numa_nodex(void) { + return 0; +} +static int mi_os_numa_node_countx(void) { + return 1; +} +#endif + +int _mi_os_numa_node_count(void) { + long ncount = mi_os_numa_node_countx(); + // never more than max numa node and at least 1 + long nmax = 1 + mi_option_get(mi_option_max_numa_node); + if (ncount > nmax) ncount = nmax; + if (ncount <= 0) ncount = 1; + return ncount; +} + +int _mi_os_numa_node(void) { + int nnode = mi_os_numa_nodex(); + // never more than the node count + int ncount = _mi_os_numa_node_count(); + if (nnode >= ncount) { nnode = nnode % ncount; } + return nnode; +} From 3fadf4abaf5ee91c38c6e593a1faabb28d9ab2f9 Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 1 Nov 2019 20:01:08 -0700 Subject: [PATCH 04/48] initial numa awareness for regions --- src/memory.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/memory.c b/src/memory.c index 9ab7c850..02e82e4d 100644 --- a/src/memory.c +++ b/src/memory.c @@ -45,10 +45,8 @@ bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats); bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats); bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); -void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats); -void* _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment); -bool _mi_os_is_huge_reserved(void* p); +//void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); +//void _mi_os_free_ex(void* p, size_t size, bool 
was_committed, mi_stats_t* stats); // arena.c void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats); @@ -93,7 +91,8 @@ typedef struct mem_region_s { volatile _Atomic(uintptr_t) map; // in-use bit per MI_SEGMENT_SIZE block volatile _Atomic(mi_region_info_t) info; // start of virtual memory area, and flags volatile _Atomic(uintptr_t) dirty_mask; // bit per block if the contents are not zero'd - size_t arena_memid; + volatile _Atomic(uintptr_t) numa_node; // associated numa node + 1 (so 0 is no association) + size_t arena_memid; // if allocated from a (huge page) arena } mem_region_t; @@ -212,6 +211,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit if (mi_atomic_cas_strong(®ion->info, info, 0)) { // update the region count region->arena_memid = arena_memid; + mi_atomic_write(®ion->numa_node, _mi_os_numa_node() + 1); mi_atomic_increment(®ions_count); } else { @@ -220,6 +220,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { regions[idx+i].arena_memid = arena_memid; + mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node() + 1); mi_atomic_increment(®ions_count); start = NULL; break; @@ -365,15 +366,18 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written // if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. // (not being able to claim is not considered an error so check for `p != NULL` afterwards). -static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size, - bool* commit, bool* allow_large, bool* is_zero, - void** p, size_t* id, mi_os_tld_t* tld) +static bool mi_region_try_alloc_blocks(int numa_node, size_t idx, size_t blocks, size_t size, + bool* commit, bool* allow_large, bool* is_zero, + void** p, size_t* id, mi_os_tld_t* tld) { // check if there are available blocks in the region.. mi_assert_internal(idx < MI_REGION_MAX); mem_region_t* region = ®ions[idx]; uintptr_t m = mi_atomic_read_relaxed(®ion->map); - if (m != MI_REGION_MAP_FULL) { // some bits are zero + int rnode = ((int)mi_atomic_read_relaxed(®ion->numa_node)) - 1; + if ((rnode < 0 || rnode == numa_node) && // fits current numa node + (m != MI_REGION_MAP_FULL)) // and some bits are zero + { bool ok = (*commit || *allow_large); // committing or allow-large is always ok if (!ok) { // otherwise skip incompatible regions if possible. @@ -426,19 +430,20 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE); // find a range of free blocks + int numa_node = _mi_os_numa_node(); void* p = NULL; size_t count = mi_atomic_read(®ions_count); size_t idx = tld->region_idx; // start at 0 to reuse low addresses? Or, use tld->region_idx to reduce contention? for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around - if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error + if (!mi_region_try_alloc_blocks(numa_node, idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error if (p != NULL) break; } if (p == NULL) { // no free range in existing regions -- try to extend beyond the count.. 
but at most 8 regions for (idx = count; idx < mi_atomic_read_relaxed(®ions_count) + 8 && idx < MI_REGION_MAX; idx++) { - if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error + if (!mi_region_try_alloc_blocks(numa_node, idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error if (p != NULL) break; } } From 2d10c78587d6cf781ffb40c24cb727ecff625841 Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 1 Nov 2019 20:19:00 -0700 Subject: [PATCH 05/48] fix linux compilation --- CMakeLists.txt | 1 + src/arena.c | 3 ++- src/init.c | 2 +- src/os.c | 4 +++- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 81cc339a..e9eb6feb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,7 @@ set(mi_install_dir "lib/mimalloc-${mi_version}") set(mi_sources src/stats.c src/os.c + src/arena.c src/memory.c src/segment.c src/page.c diff --git a/src/arena.c b/src/arena.c index 5bc3900c..bb1c1c10 100644 --- a/src/arena.c +++ b/src/arena.c @@ -381,7 +381,7 @@ static bool mi_arena_add(mi_arena_t* arena) { #include // ENOMEM // reserve at a specific numa node -static int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept { +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept { size_t hsize = 0; void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, &hsize); if (p==NULL) return ENOMEM; @@ -432,6 +432,7 @@ int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept { } int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + UNUSED(max_secs); _mi_verbose_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); if (pages_reserved != NULL) *pages_reserved = 0; int err = mi_reserve_huge_os_pages_interleave(pages); diff --git a/src/init.c b/src/init.c index 138b54aa..0813fddd 100644 --- a/src/init.c +++ b/src/init.c @@ -434,7 +434,7 @@ static void mi_process_load(void) { if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); - double max_secs = (double)pages / 2.0; // 0.5s per page (1GiB) + // double max_secs = (double)pages / 2.0; // 0.5s per page (1GiB) mi_reserve_huge_os_pages_interleave(pages); } } diff --git a/src/os.c b/src/os.c index c0564174..2bb3ee3c 100644 --- a/src/os.c +++ b/src/os.c @@ -851,7 +851,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { mi_assert_internal(size%GiB == 0); bool is_large = true; - void* p = mi_unix_mmap(NULL, MI_HUGE_OS_PAGE_SIZE, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); + void* p = mi_unix_mmap(NULL, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); if (p == NULL) return NULL; #ifdef MI_HAS_NUMA if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { @@ -861,6 +861,8 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno)); } } + #else + UNUSED(numa_node); #endif return p; } From 57dd69265ad294e7cdfcc13ef7ecb69b7c5d61b1 Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 1 Nov 2019 20:30:01 -0700 Subject: [PATCH 06/48] normalize numa node --- src/arena.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/arena.c b/src/arena.c index bb1c1c10..381d4486 100644 --- a/src/arena.c +++ 
b/src/arena.c @@ -383,6 +383,8 @@ static bool mi_arena_add(mi_arena_t* arena) { // reserve at a specific numa node int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept { size_t hsize = 0; + if (numa_node < -1) numa_node = -1; + if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, &hsize); if (p==NULL) return ENOMEM; _mi_verbose_message("reserved %zu huge (1GiB) pages\n", pages); From 2c12d7f2234b25308478e22c9342a07623b6f891 Mon Sep 17 00:00:00 2001 From: daan Date: Fri, 1 Nov 2019 22:01:52 -0700 Subject: [PATCH 07/48] optimized numa calls; better Linux support --- CMakeLists.txt | 12 ++++ include/mimalloc-internal.h | 2 +- include/mimalloc-types.h | 1 + src/arena.c | 2 +- src/init.c | 3 +- src/memory.c | 6 +- src/os.c | 114 ++++++++++++++++++++++++------------ 7 files changed, 97 insertions(+), 43 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9eb6feb..1e96c237 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,8 @@ cmake_minimum_required(VERSION 3.0) project(libmimalloc C CXX) include("cmake/mimalloc-config-version.cmake") +include("CheckIncludeFile") + set(CMAKE_C_STANDARD 11) set(CMAKE_CXX_STANDARD 17) @@ -88,6 +90,16 @@ if(MI_USE_CXX MATCHES "ON") set_source_files_properties(src/static.c test/test-api.c PROPERTIES LANGUAGE CXX ) endif() +CHECK_INCLUDE_FILE("numaif.h" MI_HAVE_NUMA_H) +if(MI_HAVE_NUMA_H) + list(APPEND mi_defines MI_HAS_NUMA) + list(APPEND mi_libraries numa) +else() + if (NOT(WIN32)) + message(WARNING "Compiling without using NUMA optimized allocation (on Linux, install libnuma-dev?)") + endif() +endif() + # Compiler flags if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU") list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index dd677a02..b4d3351d 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -56,7 +56,7 @@ void _mi_os_init(void); // called fro void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocate thread local data void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data size_t _mi_os_good_alloc_size(size_t size); -int _mi_os_numa_node(void); +int _mi_os_numa_node(mi_os_tld_t* tld); // memory.c diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 99b6b22b..0208d5c7 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -413,6 +413,7 @@ typedef struct mi_segments_tld_s { // OS thread local data typedef struct mi_os_tld_s { size_t region_idx; // start point for next allocation + int numa_node; // numa node associated with this thread mi_stats_t* stats; // points to tld stats } mi_os_tld_t; diff --git a/src/arena.c b/src/arena.c index 381d4486..7eb755c4 100644 --- a/src/arena.c +++ b/src/arena.c @@ -267,7 +267,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, { size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); size_t bcount = asize / MI_ARENA_BLOCK_SIZE; - int numa_node = _mi_os_numa_node(); // current numa node + int numa_node = _mi_os_numa_node(tld); // current numa node mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); // try numa affine allocation diff --git a/src/init.c b/src/init.c index 0813fddd..166ca451 100644 --- a/src/init.c +++ b/src/init.c @@ -99,7 +99,7 @@ static mi_tld_t tld_main = { 0, false, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments - { 0, tld_main_stats }, // 
os + { 0, -1, tld_main_stats }, // os { MI_STATS_NULL } // stats }; @@ -218,6 +218,7 @@ static bool _mi_heap_init(void) { memset(tld, 0, sizeof(*tld)); tld->heap_backing = heap; tld->segments.stats = &tld->stats; + tld->os.numa_node = -1; tld->os.stats = &tld->stats; _mi_heap_default = heap; } diff --git a/src/memory.c b/src/memory.c index 02e82e4d..a425393c 100644 --- a/src/memory.c +++ b/src/memory.c @@ -211,7 +211,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit if (mi_atomic_cas_strong(®ion->info, info, 0)) { // update the region count region->arena_memid = arena_memid; - mi_atomic_write(®ion->numa_node, _mi_os_numa_node() + 1); + mi_atomic_write(®ion->numa_node, _mi_os_numa_node(tld) + 1); mi_atomic_increment(®ions_count); } else { @@ -220,7 +220,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { regions[idx+i].arena_memid = arena_memid; - mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node() + 1); + mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node(tld) + 1); mi_atomic_increment(®ions_count); start = NULL; break; @@ -430,7 +430,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE); // find a range of free blocks - int numa_node = _mi_os_numa_node(); + int numa_node = _mi_os_numa_node(tld); void* p = NULL; size_t count = mi_atomic_read(®ions_count); size_t idx = tld->region_idx; // start at 0 to reuse low addresses? Or, use tld->region_idx to reduce contention? diff --git a/src/os.c b/src/os.c index 2bb3ee3c..677d0ea2 100644 --- a/src/os.c +++ b/src/os.c @@ -97,7 +97,7 @@ typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T* static PVirtualAlloc2 pVirtualAlloc2 = NULL; static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; -static bool mi_win_enable_large_os_pages() +static bool mi_win_enable_large_os_pages() { if (large_os_page_size > 0) return true; @@ -148,10 +148,10 @@ void _mi_os_init(void) { FreeLibrary(hDll); } hDll = LoadLibrary(TEXT("ntdll.dll")); - if (hDll != NULL) { + if (hDll != NULL) { pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx"); FreeLibrary(hDll); - } + } if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { mi_win_enable_large_os_pages(); } @@ -191,7 +191,7 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats #else err = (munmap(addr, size) == -1); #endif - if (was_committed) _mi_stat_decrease(&stats->committed, size); + if (was_committed) _mi_stat_decrease(&stats->committed, size); _mi_stat_decrease(&stats->reserved, size); if (err) { #pragma warning(suppress:4996) @@ -207,14 +207,14 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size); #ifdef _WIN32 static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) { -#if (MI_INTPTR_SIZE >= 8) +#if (MI_INTPTR_SIZE >= 8) // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations void* hint; if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) { return VirtualAlloc(hint, size, flags, PAGE_READWRITE); } #endif -#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) +#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) // on modern 
Windows try use VirtualAlloc2 for aligned allocation if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { MEM_ADDRESS_REQUIREMENTS reqs = { 0 }; @@ -232,7 +232,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, mi_assert_internal(!(large_only && !allow_large)); static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; void* p = NULL; - if ((large_only || use_large_os_page(size, try_alignment)) + if ((large_only || use_large_os_page(size, try_alignment)) && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); if (!large_only && try_ok > 0) { @@ -372,7 +372,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro } if (p == NULL) { *is_large = false; - p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); + p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); #if defined(MADV_HUGEPAGE) // Many Linux systems don't allow MAP_HUGETLB but they support instead // transparent huge pages (THP). It is not required to call `madvise` with MADV_HUGE @@ -391,7 +391,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro } #endif -// On 64-bit systems, we can do efficient aligned allocation by using +// On 64-bit systems, we can do efficient aligned allocation by using // the 4TiB to 30TiB area to allocate them. #if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED))) static volatile _Atomic(intptr_t) aligned_base; @@ -785,14 +785,14 @@ bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) { /* ---------------------------------------------------------------------------- -Support for allocating huge OS pages (1Gib) that are reserved up-front +Support for allocating huge OS pages (1Gib) that are reserved up-front and possibly associated with a specific NUMA node. 
(use `numa_node>=0`) -----------------------------------------------------------------------------*/ -#define MI_HUGE_OS_PAGE_SIZE (GiB) +#define MI_HUGE_OS_PAGE_SIZE (GiB) #if defined(WIN32) && (MI_INTPTR_SIZE >= 8) -static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) -{ +static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) +{ mi_assert_internal(size%GiB == 0); #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) @@ -802,8 +802,8 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) reqs.HighestEndingAddress = NULL; reqs.LowestStartingAddress = NULL; reqs.Alignment = MI_SEGMENT_SIZE; - - // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages + + // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages if (pNtAllocateVirtualMemoryEx != NULL) { #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) @@ -825,10 +825,10 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) return base; } else { - // fall back to regular huge pages + // fall back to regular huge pages _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (error 0x%lx)\n", err); } - } + } // on modern Windows try use VirtualAlloc2 for aligned large OS page allocation if (pVirtualAlloc2 != NULL) { params[0].Type = MemExtendedParameterAddressRequirements; @@ -842,7 +842,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) return (*pVirtualAlloc2)(GetCurrentProcess(), NULL, size, flags, PAGE_READWRITE, params, param_count); } #endif - return NULL; // give up on older Windows.. + return NULL; // give up on older Windows.. } #elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) #ifdef MI_HAS_NUMA @@ -853,7 +853,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { bool is_large = true; void* p = mi_unix_mmap(NULL, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); if (p == NULL) return NULL; - #ifdef MI_HAS_NUMA + #ifdef MI_HAS_NUMA if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { uintptr_t numa_mask = (1UL << numa_node); long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); @@ -866,7 +866,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { #endif return p; } -#else +#else static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { return NULL; } @@ -884,12 +884,12 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) { } #ifdef WIN32 -static int mi_os_numa_nodex(void) { +static int mi_os_numa_nodex() { PROCESSOR_NUMBER pnum; USHORT numa_node = 0; GetCurrentProcessorNumberEx(&pnum); GetNumaProcessorNodeEx(&pnum,&numa_node); - return (int)numa_node; + return (int)numa_node; } static int mi_os_numa_node_countx(void) { @@ -898,12 +898,42 @@ static int mi_os_numa_node_countx(void) { return (int)(numa_max + 1); } #elif MI_HAS_NUMA -#include +#include +#include +#include static int mi_os_numa_nodex(void) { - return numa_preferred(); + #define MI_MAX_MASK (4) // support at most 256 nodes + unsigned long mask[MI_MAX_MASK]; + memset(mask,0,MI_MAX_MASK*sizeof(long)); + int mode = 0; + long err = get_mempolicy(&mode, mask, MI_MAX_MASK*sizeof(long)*8, NULL, 0 /* thread policy */); + if (err != 0) return 0; + // find the lowest bit that is set + for(int i = 0; i < MI_MAX_MASK; i++) { + for(int j = 0; j < (int)(sizeof(long)*8); j++) { + if ((mask[i] & (1UL << j)) != 0) { + return (i*sizeof(long)*8 + j); + } + } + } + 
return 0; } + static int mi_os_numa_node_countx(void) { - return (numa_max_node() + 1); + DIR* d = opendir("/sys/devices/system/node"); + if (d==NULL) return 1; + + struct dirent* de; + int max_node_num = 0; + while ((de = readdir(d)) != NULL) { + int node_num; + if (strncmp(de->d_name, "node", 4) == 0) { + node_num = (int)strtol(de->d_name+4, NULL, 0); + if (max_node_num < node_num) max_node_num = node_num; + } + } + closedir(d); + return (max_node_num + 1); } #else static int mi_os_numa_nodex(void) { @@ -915,18 +945,28 @@ static int mi_os_numa_node_countx(void) { #endif int _mi_os_numa_node_count(void) { - long ncount = mi_os_numa_node_countx(); - // never more than max numa node and at least 1 - long nmax = 1 + mi_option_get(mi_option_max_numa_node); - if (ncount > nmax) ncount = nmax; - if (ncount <= 0) ncount = 1; - return ncount; + static int numa_node_count = 0; + if (mi_unlikely(numa_node_count <= 0)) { + int ncount = mi_os_numa_node_countx(); + // never more than max numa node and at least 1 + int nmax = 1 + (int)mi_option_get(mi_option_max_numa_node); + if (ncount > nmax) ncount = nmax; + if (ncount <= 0) ncount = 1; + numa_node_count = ncount; + } + mi_assert_internal(numa_node_count >= 1); + return numa_node_count; } -int _mi_os_numa_node(void) { - int nnode = mi_os_numa_nodex(); - // never more than the node count - int ncount = _mi_os_numa_node_count(); - if (nnode >= ncount) { nnode = nnode % ncount; } - return nnode; +int _mi_os_numa_node(mi_os_tld_t* tld) { + if (mi_unlikely(tld->numa_node < 0)) { + int nnode = mi_os_numa_nodex(); + // never more than the node count + int ncount = _mi_os_numa_node_count(); + if (nnode >= ncount) { nnode = nnode % ncount; } + if (nnode < 0) nnode = 0; + tld->numa_node = nnode; + } + mi_assert_internal(tld->numa_node >= 0 && tld->numa_node < _mi_os_numa_node_count()); + return tld->numa_node; } From a69016c33e5969b07426669b58e6a927c478c308 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 2 Nov 2019 10:30:16 -0700 Subject: [PATCH 08/48] improve and document numa support --- src/os.c | 39 +++++++++++++++++++++++++++++---------- test/main-override.cpp | 2 +- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/os.c b/src/os.c index 677d0ea2..fc89d642 100644 --- a/src/os.c +++ b/src/os.c @@ -854,8 +854,11 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { void* p = mi_unix_mmap(NULL, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); if (p == NULL) return NULL; #ifdef MI_HAS_NUMA - if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { + if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes uintptr_t numa_mask = (1UL << numa_node); + // TODO: does `mbind` work correctly for huge OS pages? should we + // use `set_mempolicy` before calling mmap instead? 
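Aside (sketch only, not in the patch): the alternative the TODO above asks about would set a preferred-node memory policy for the calling thread before the mmap and restore the default afterwards, instead of mbind-ing the mapped range after the fact. A rough sketch, assuming at most 64 nodes as in the surrounding code and the libnuma header used for mbind:

  #include <numaif.h>   // set_mempolicy, MPOL_PREFERRED, MPOL_DEFAULT (libnuma, -lnuma)

  // prefer `numa_node` for pages subsequently allocated (faulted in) by this thread
  static void demo_prefer_node(int numa_node) {
    unsigned long mask = 1UL << numa_node;            // single-node mask, nodes 0..63
    set_mempolicy(MPOL_PREFERRED, &mask, 8 * sizeof(mask));
  }

  // restore the default policy once the huge pages are mapped
  static void demo_reset_node_policy(void) {
    set_mempolicy(MPOL_DEFAULT, NULL, 0);
  }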
+ // see: long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); if (err != 0) { _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno)); @@ -883,6 +886,9 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) { return p; } +/* ---------------------------------------------------------------------------- +Support NUMA aware allocation +-----------------------------------------------------------------------------*/ #ifdef WIN32 static int mi_os_numa_nodex() { PROCESSOR_NUMBER pnum; @@ -902,6 +908,9 @@ static int mi_os_numa_node_countx(void) { #include #include static int mi_os_numa_nodex(void) { + #define MI_NUMA_NODE_SLOW // too slow, so cache it + // TODO: perhaps use RDTSCP instruction on x64? + // see #define MI_MAX_MASK (4) // support at most 256 nodes unsigned long mask[MI_MAX_MASK]; memset(mask,0,MI_MAX_MASK*sizeof(long)); @@ -945,7 +954,7 @@ static int mi_os_numa_node_countx(void) { #endif int _mi_os_numa_node_count(void) { - static int numa_node_count = 0; + static int numa_node_count = 0; // cache the node count if (mi_unlikely(numa_node_count <= 0)) { int ncount = mi_os_numa_node_countx(); // never more than max numa node and at least 1 @@ -959,14 +968,24 @@ int _mi_os_numa_node_count(void) { } int _mi_os_numa_node(mi_os_tld_t* tld) { + int numa_node; +#ifndef MI_NUMA_NODE_SLOW + UNUSED(tld); + numa_node = mi_os_numa_nodex(); +#else if (mi_unlikely(tld->numa_node < 0)) { - int nnode = mi_os_numa_nodex(); - // never more than the node count - int ncount = _mi_os_numa_node_count(); - if (nnode >= ncount) { nnode = nnode % ncount; } - if (nnode < 0) nnode = 0; - tld->numa_node = nnode; + // Cache the NUMA node of the thread if the call is slow. + // This may not be correct as threads can migrate to another cpu on + // another node -- however, for memory allocation this just means we keep + // using the same 'node id' for its allocations; new OS allocations + // naturally come from the actual node so in practice this may be fine. + tld->numa_node = mi_os_numa_nodex(); } - mi_assert_internal(tld->numa_node >= 0 && tld->numa_node < _mi_os_numa_node_count()); - return tld->numa_node; + numa_node = tld->numa_node +#endif + // never more than the node count and >= 0 + int numa_count = _mi_os_numa_node_count(); + if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } + if (numa_node < 0) numa_node = 0; + return numa_node; } diff --git a/test/main-override.cpp b/test/main-override.cpp index e006ad27..f7a7f1bd 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -24,7 +24,7 @@ public: int main() { - //mi_stats_reset(); // ignore earlier allocations + mi_stats_reset(); // ignore earlier allocations atexit(free_p); void* p1 = malloc(78); void* p2 = mi_malloc_aligned(16,24); From 70748ee1ee1da3e9ad14c2d751623e47cb3fd287 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 2 Nov 2019 10:39:09 -0700 Subject: [PATCH 09/48] fix missing semi colon --- src/os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os.c b/src/os.c index fc89d642..c41d028f 100644 --- a/src/os.c +++ b/src/os.c @@ -981,7 +981,7 @@ int _mi_os_numa_node(mi_os_tld_t* tld) { // naturally come from the actual node so in practice this may be fine. 
tld->numa_node = mi_os_numa_nodex(); } - numa_node = tld->numa_node + numa_node = tld->numa_node; #endif // never more than the node count and >= 0 int numa_count = _mi_os_numa_node_count(); From fd9d8c85ae40db95feb51da6e5478850bc6722fc Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 2 Nov 2019 11:55:03 -0700 Subject: [PATCH 10/48] change numa support on linux to use getcpu --- include/mimalloc-types.h | 1 - src/init.c | 5 ++--- src/os.c | 45 +++++++++++----------------------------- 3 files changed, 14 insertions(+), 37 deletions(-) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 0208d5c7..99b6b22b 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -413,7 +413,6 @@ typedef struct mi_segments_tld_s { // OS thread local data typedef struct mi_os_tld_s { size_t region_idx; // start point for next allocation - int numa_node; // numa node associated with this thread mi_stats_t* stats; // points to tld stats } mi_os_tld_t; diff --git a/src/init.c b/src/init.c index 166ca451..ed15aeba 100644 --- a/src/init.c +++ b/src/init.c @@ -99,8 +99,8 @@ static mi_tld_t tld_main = { 0, false, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments - { 0, -1, tld_main_stats }, // os - { MI_STATS_NULL } // stats + { 0, tld_main_stats }, // os + { MI_STATS_NULL } // stats }; mi_heap_t _mi_heap_main = { @@ -218,7 +218,6 @@ static bool _mi_heap_init(void) { memset(tld, 0, sizeof(*tld)); tld->heap_backing = heap; tld->segments.stats = &tld->stats; - tld->os.numa_node = -1; tld->os.stats = &tld->stats; _mi_heap_default = heap; } diff --git a/src/os.c b/src/os.c index c41d028f..8e1b3e91 100644 --- a/src/os.c +++ b/src/os.c @@ -903,29 +903,21 @@ static int mi_os_numa_node_countx(void) { GetNumaHighestNodeNumber(&numa_max); return (int)(numa_max + 1); } -#elif MI_HAS_NUMA +#elif defined(__linux__) #include #include -#include +#include + static int mi_os_numa_nodex(void) { - #define MI_NUMA_NODE_SLOW // too slow, so cache it - // TODO: perhaps use RDTSCP instruction on x64? - // see - #define MI_MAX_MASK (4) // support at most 256 nodes - unsigned long mask[MI_MAX_MASK]; - memset(mask,0,MI_MAX_MASK*sizeof(long)); - int mode = 0; - long err = get_mempolicy(&mode, mask, MI_MAX_MASK*sizeof(long)*8, NULL, 0 /* thread policy */); +#ifdef SYS_getcpu + unsigned node = 0; + unsigned ncpu = 0; + int err = syscall(SYS_getcpu, &ncpu, &node, NULL); if (err != 0) return 0; - // find the lowest bit that is set - for(int i = 0; i < MI_MAX_MASK; i++) { - for(int j = 0; j < (int)(sizeof(long)*8); j++) { - if ((mask[i] & (1UL << j)) != 0) { - return (i*sizeof(long)*8 + j); - } - } - } - return 0; + return (int)node; +#else + return 0; +#endif } static int mi_os_numa_node_countx(void) { @@ -968,21 +960,8 @@ int _mi_os_numa_node_count(void) { } int _mi_os_numa_node(mi_os_tld_t* tld) { - int numa_node; -#ifndef MI_NUMA_NODE_SLOW UNUSED(tld); - numa_node = mi_os_numa_nodex(); -#else - if (mi_unlikely(tld->numa_node < 0)) { - // Cache the NUMA node of the thread if the call is slow. - // This may not be correct as threads can migrate to another cpu on - // another node -- however, for memory allocation this just means we keep - // using the same 'node id' for its allocations; new OS allocations - // naturally come from the actual node so in practice this may be fine. 
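Aside: the #include arguments in patch 10's os.c hunk above were lost in formatting; for SYS_getcpu they are presumably <sys/syscall.h> and <unistd.h>. The getcpu-based lookup the patch switches to, restated as a self-contained sketch:

  #include <sys/syscall.h>   // SYS_getcpu
  #include <unistd.h>        // syscall

  // NUMA node the calling thread is currently running on; node 0 on failure
  static int demo_current_numa_node(void) {
  #ifdef SYS_getcpu
    unsigned cpu = 0, node = 0;
    if (syscall(SYS_getcpu, &cpu, &node, NULL) != 0) return 0;
    return (int)node;
  #else
    return 0;
  #endif
  }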
- tld->numa_node = mi_os_numa_nodex(); - } - numa_node = tld->numa_node; -#endif + int numa_node = mi_os_numa_nodex(); // never more than the node count and >= 0 int numa_count = _mi_os_numa_node_count(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } From ee323aabac42ab4333e40cedd02f0eb1d4356b4e Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 2 Nov 2019 15:56:21 -0700 Subject: [PATCH 11/48] fix vs2017 build --- ide/vs2017/mimalloc-override.vcxproj | 1 + ide/vs2017/mimalloc-override.vcxproj.filters | 3 +++ ide/vs2017/mimalloc.vcxproj | 1 + ide/vs2017/mimalloc.vcxproj.filters | 3 +++ src/os.c | 5 +++-- 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ide/vs2017/mimalloc-override.vcxproj b/ide/vs2017/mimalloc-override.vcxproj index 511c0fab..1fc70b33 100644 --- a/ide/vs2017/mimalloc-override.vcxproj +++ b/ide/vs2017/mimalloc-override.vcxproj @@ -231,6 +231,7 @@ + diff --git a/ide/vs2017/mimalloc-override.vcxproj.filters b/ide/vs2017/mimalloc-override.vcxproj.filters index 6ac0c0b5..75a8e032 100644 --- a/ide/vs2017/mimalloc-override.vcxproj.filters +++ b/ide/vs2017/mimalloc-override.vcxproj.filters @@ -70,5 +70,8 @@ Source Files + + Source Files + \ No newline at end of file diff --git a/ide/vs2017/mimalloc.vcxproj b/ide/vs2017/mimalloc.vcxproj index 6147c349..484c4db8 100644 --- a/ide/vs2017/mimalloc.vcxproj +++ b/ide/vs2017/mimalloc.vcxproj @@ -217,6 +217,7 @@ + diff --git a/ide/vs2017/mimalloc.vcxproj.filters b/ide/vs2017/mimalloc.vcxproj.filters index a2b64314..598b8643 100644 --- a/ide/vs2017/mimalloc.vcxproj.filters +++ b/ide/vs2017/mimalloc.vcxproj.filters @@ -53,6 +53,9 @@ Source Files + + Source Files + diff --git a/src/os.c b/src/os.c index 8e1b3e91..4aa4abf3 100644 --- a/src/os.c +++ b/src/os.c @@ -794,6 +794,7 @@ and possibly associated with a specific NUMA node. 
(use `numa_node>=0`) static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { mi_assert_internal(size%GiB == 0); + mi_win_enable_large_os_pages(); #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; @@ -812,7 +813,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) params[0].Pointer = &reqs; params[1].Type = 5; // == MemExtendedParameterAttributeFlags; params[1].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; - size_t param_count = 2; + ULONG param_count = 2; if (numa_node >= 0) { param_count++; params[2].Type = MemExtendedParameterNumaNode; @@ -833,7 +834,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) if (pVirtualAlloc2 != NULL) { params[0].Type = MemExtendedParameterAddressRequirements; params[0].Pointer = &reqs; - size_t param_count = 1; + ULONG param_count = 1; if (numa_node >= 0) { param_count++; params[1].Type = MemExtendedParameterNumaNode; From 62cd0237fc8583f357fe4599889011f845690af1 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 2 Nov 2019 17:49:34 -0700 Subject: [PATCH 12/48] fix aligned huge page allocation on windows --- src/arena.c | 2 +- src/os.c | 118 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 76 insertions(+), 44 deletions(-) diff --git a/src/arena.c b/src/arena.c index 7eb755c4..56b09859 100644 --- a/src/arena.c +++ b/src/arena.c @@ -435,7 +435,7 @@ int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept { int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { UNUSED(max_secs); - _mi_verbose_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); + _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); if (pages_reserved != NULL) *pages_reserved = 0; int err = mi_reserve_huge_os_pages_interleave(pages); if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; diff --git a/src/os.c b/src/os.c index 4aa4abf3..e1dc31f8 100644 --- a/src/os.c +++ b/src/os.c @@ -791,68 +791,68 @@ and possibly associated with a specific NUMA node. 
(use `numa_node>=0`) #define MI_HUGE_OS_PAGE_SIZE (GiB) #if defined(WIN32) && (MI_INTPTR_SIZE >= 8) -static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) +static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { mi_assert_internal(size%GiB == 0); + mi_assert_internal(addr != NULL); + const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; + mi_win_enable_large_os_pages(); - + + void* p = NULL; #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) - DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; - MEM_EXTENDED_PARAMETER params[4] = { {0,0},{0,0},{0,0},{0,0} }; - MEM_ADDRESS_REQUIREMENTS reqs = {0,0,0}; - reqs.HighestEndingAddress = NULL; - reqs.LowestStartingAddress = NULL; - reqs.Alignment = MI_SEGMENT_SIZE; - + MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} }; // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages if (pNtAllocateVirtualMemoryEx != NULL) { #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) #endif - params[0].Type = MemExtendedParameterAddressRequirements; - params[0].Pointer = &reqs; - params[1].Type = 5; // == MemExtendedParameterAttributeFlags; - params[1].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; - ULONG param_count = 2; - if (numa_node >= 0) { - param_count++; - params[2].Type = MemExtendedParameterNumaNode; - params[2].ULong = (unsigned)numa_node; - } - SIZE_T psize = size; - void* base = NULL; - NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); - if (err == 0) { - return base; - } - else { - // fall back to regular huge pages - _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (error 0x%lx)\n", err); - } - } - // on modern Windows try use VirtualAlloc2 for aligned large OS page allocation - if (pVirtualAlloc2 != NULL) { - params[0].Type = MemExtendedParameterAddressRequirements; - params[0].Pointer = &reqs; + params[0].Type = 5; // == MemExtendedParameterAttributeFlags; + params[0].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; ULONG param_count = 1; if (numa_node >= 0) { param_count++; params[1].Type = MemExtendedParameterNumaNode; params[1].ULong = (unsigned)numa_node; } - return (*pVirtualAlloc2)(GetCurrentProcess(), NULL, size, flags, PAGE_READWRITE, params, param_count); + SIZE_T psize = size; + void* base = addr; + NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); + if (err == 0 && base != NULL) { + return base; + } + else { + // fall back to regular huge pages + _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err); + } } + // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation + if (pVirtualAlloc2 != NULL && numa_node >= 0) { + params[0].Type = MemExtendedParameterNumaNode; + params[0].ULong = (unsigned)numa_node; + p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); + } + else #endif - return NULL; // give up on older Windows.. 
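Aside (sketch, not part of the patch): the aligned-address bookkeeping added just below keeps a process-wide cursor that starts past the 32TiB mark and is bumped with a CAS, so concurrent reservations get non-overlapping, MI_SEGMENT_SIZE-aligned ranges. Restated with standard C11 atomics instead of mimalloc's internal wrappers, and without the randomized start:

  #include <stdatomic.h>
  #include <stddef.h>
  #include <stdint.h>

  static _Atomic uintptr_t demo_huge_cursor;   // 0 until first use

  // claim a non-overlapping address range of `size` bytes (size is a multiple of 1GiB)
  static uintptr_t demo_claim_huge_range(size_t size) {
    uintptr_t start, end, expected;
    do {
      start = expected = atomic_load_explicit(&demo_huge_cursor, memory_order_relaxed);
      if (start == 0) start = (uintptr_t)32 << 40;   // begin after the 32TiB mark
      end = start + size;
    } while (!atomic_compare_exchange_strong(&demo_huge_cursor, &expected, end));
    return start;   // used as the address hint for the actual OS allocation
  }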
+ // use regular virtual alloc on older windows + { + p = VirtualAlloc(addr, size, flags, PAGE_READWRITE); + } + + if (p == NULL) { + _mi_warning_message("failed to allocate huge OS pages (size %zu) (error %d)\n", size, GetLastError()); + } + return p; } + #elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) #ifdef MI_HAS_NUMA #include // mbind, and use -lnuma #endif -static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { +static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { mi_assert_internal(size%GiB == 0); bool is_large = true; - void* p = mi_unix_mmap(NULL, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); + void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); if (p == NULL) return NULL; #ifdef MI_HAS_NUMA if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes @@ -871,19 +871,51 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { return p; } #else -static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) { +static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { return NULL; } #endif +// To ensure proper alignment, use our own area for huge OS pages +static _Atomic(uintptr_t) mi_huge_start; // = 0 + +// Allocate MI_SEGMENT_SIZE aligned huge pages void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) { if (psize != NULL) *psize = 0; - size_t size = pages * MI_HUGE_OS_PAGE_SIZE; - void* p = mi_os_alloc_huge_os_pagesx(size, numa_node); - if (p==NULL) return NULL; - if (psize != NULL) *psize = size; + const size_t size = pages * MI_HUGE_OS_PAGE_SIZE; + + // Find a new aligned address for the huge pages + uintptr_t start = 0; + uintptr_t end = 0; + uintptr_t expected; + do { + start = expected = mi_atomic_read_relaxed(&mi_huge_start); + if (start == 0) { + // Initialize the start address after the 32TiB area + start = ((uintptr_t)32 << 40); // 32TiB virtual start address + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode + uintptr_t r = _mi_random_init((uintptr_t)&_mi_os_alloc_huge_os_pages); + start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB + #endif + } + end = start + size; + mi_assert_internal(end % MI_SEGMENT_SIZE == 0); + } while (!mi_atomic_cas_strong(&mi_huge_start, end, expected)); + + // And allocate + void* p = mi_os_alloc_huge_os_pagesx((void*)start, size, numa_node); + if (p == NULL) { + return NULL; + } _mi_stat_increase(&_mi_stats_main.committed, size); _mi_stat_increase(&_mi_stats_main.reserved, size); + if ((uintptr_t)p % MI_SEGMENT_SIZE != 0) { // must be aligned + _mi_warning_message("huge page area was not aligned\n"); + _mi_os_free(p,size,&_mi_stats_main); + return NULL; + } + + if (psize != NULL) *psize = size; return p; } From 723fbba2596e663b6dac40da5e486c0ac52501f3 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 3 Nov 2019 12:18:20 -0800 Subject: [PATCH 13/48] fix output during preloading enabling stderr only after the crt has loaded --- src/options.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/options.c b/src/options.c index 32f13d54..3a7833a2 100644 --- a/src/options.c +++ b/src/options.c @@ -14,6 +14,8 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // toupper #include +static void mi_add_stderr_output(); + int mi_version(void) mi_attr_noexcept { return MI_MALLOC_VERSION; } @@ -73,7 +75,9 @@ static mi_option_desc_t options[_mi_option_last] = static void mi_option_init(mi_option_desc_t* desc); void _mi_options_init(void) { - // called on process load + // called on process load; should not be called before the CRT is initialized! + // (e.g. do not call this from process_init as that may run before CRT initialization) + mi_add_stderr_output(); // now it safe to use stderr for output for(int i = 0; i < _mi_option_last; i++ ) { mi_option_t option = (mi_option_t)i; mi_option_get(option); // initialize @@ -135,7 +139,7 @@ static void mi_out_stderr(const char* msg) { #ifdef _WIN32 // on windows with redirection, the C runtime cannot handle locale dependent output // after the main thread closes so we use direct console output. - _cputs(msg); + if (!_mi_preloading()) { _cputs(msg); } #else fputs(msg, stderr); #endif @@ -166,23 +170,29 @@ static void mi_out_buf(const char* msg) { memcpy(&out_buf[start], msg, n); } -static void mi_out_buf_flush(mi_output_fun* out) { +static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf) { if (out==NULL) return; - // claim all (no more output will be added after this point) - size_t count = mi_atomic_addu(&out_len, MI_MAX_DELAY_OUTPUT); + // claim (if `no_more_buf == true`, no more output will be added after this point) + size_t count = mi_atomic_addu(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1)); // and output the current contents if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT; out_buf[count] = 0; out(out_buf); + if (!no_more_buf) { + out_buf[count] = '\n'; // if continue with the buffer, insert a newline + } } -// The initial default output, outputs to stderr and the delayed output buffer. + +// Once this module is loaded, switch to this routine +// which outputs to stderr and the delayed output buffer. static void mi_out_buf_stderr(const char* msg) { mi_out_stderr(msg); mi_out_buf(msg); } + // -------------------------------------------------------- // Default output handler // -------------------------------------------------------- @@ -194,14 +204,19 @@ static mi_output_fun* volatile mi_out_default; // = NULL static mi_output_fun* mi_out_get_default(void) { mi_output_fun* out = mi_out_default; - return (out == NULL ? &mi_out_buf_stderr : out); + return (out == NULL ? &mi_out_buf : out); } void mi_register_output(mi_output_fun* out) mi_attr_noexcept { mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer - if (out!=NULL) mi_out_buf_flush(out); // output the delayed output now + if (out!=NULL) mi_out_buf_flush(out,true); // output all the delayed output now } +// add stderr to the delayed output after the module is loaded +static void mi_add_stderr_output() { + mi_out_buf_flush(&mi_out_stderr, false); // flush current contents to stderr + mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output +} // -------------------------------------------------------- // Messages, all end up calling `_mi_fputs`. 
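Aside (illustrative sketch with hypothetical names, not the patch code): the delayed-output change above amounts to "append messages to a fixed in-memory buffer while printing is unsafe (e.g. during pre-loading), then flush that buffer to stderr once the C runtime is initialized". A compressed stand-alone version of the same idea in C11:

  #include <stdatomic.h>
  #include <stdio.h>
  #include <string.h>

  #define DEMO_BUF_MAX 1024
  static char demo_buf[DEMO_BUF_MAX + 1];   // zero-initialized
  static atomic_size_t demo_len;            // bytes appended so far

  // while output is unsafe: just remember the message (dropped once the buffer is full)
  static void demo_out_buffered(const char* msg) {
    size_t n = strlen(msg);
    size_t start = atomic_fetch_add(&demo_len, n);
    if (start >= DEMO_BUF_MAX) return;                         // exhausted
    if (start + n > DEMO_BUF_MAX) n = DEMO_BUF_MAX - start;    // clip to the buffer
    memcpy(&demo_buf[start], msg, n);
  }

  // once stderr is usable: claim the whole buffer (later appends are dropped) and print it
  static void demo_out_flush(void) {
    size_t count = atomic_fetch_add(&demo_len, (size_t)DEMO_BUF_MAX);
    if (count > DEMO_BUF_MAX) count = DEMO_BUF_MAX;
    demo_buf[count] = '\0';
    fputs(demo_buf, stderr);
  }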
@@ -214,7 +229,7 @@ static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT static mi_decl_thread bool recurse = false; void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message) { - if (_mi_preloading() || recurse) return; + if (recurse) return; if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) out = mi_out_get_default(); recurse = true; if (prefix != NULL) out(prefix); @@ -228,7 +243,7 @@ void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message) { static void mi_vfprintf( mi_output_fun* out, const char* prefix, const char* fmt, va_list args ) { char buf[512]; if (fmt==NULL) return; - if (_mi_preloading() || recurse) return; + if (recurse) return; recurse = true; vsnprintf(buf,sizeof(buf)-1,fmt,args); recurse = false; From e32048879183c2672db7d06138ca6f4eb80ebfa1 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 3 Nov 2019 12:18:32 -0800 Subject: [PATCH 14/48] add numa nodes to stats --- include/mimalloc-internal.h | 2 +- src/os.c | 7 +++++-- src/stats.c | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index b4d3351d..c28cf0fd 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -57,7 +57,7 @@ void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocat void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data size_t _mi_os_good_alloc_size(size_t size); int _mi_os_numa_node(mi_os_tld_t* tld); - +int _mi_os_numa_node_count(void); // memory.c void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld); diff --git a/src/os.c b/src/os.c index e1dc31f8..af3c440c 100644 --- a/src/os.c +++ b/src/os.c @@ -840,7 +840,8 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) } if (p == NULL) { - _mi_warning_message("failed to allocate huge OS pages (size %zu) (error %d)\n", size, GetLastError()); + DWORD winerr = GetLastError(); + _mi_warning_message("failed to allocate huge OS pages (size %zu) (windows error %d%s)\n", size, winerr, (winerr==1450 ? 
" (insufficient resources)" : "")); } return p; } @@ -981,12 +982,14 @@ static int mi_os_numa_node_countx(void) { int _mi_os_numa_node_count(void) { static int numa_node_count = 0; // cache the node count if (mi_unlikely(numa_node_count <= 0)) { - int ncount = mi_os_numa_node_countx(); + int ncount = mi_os_numa_node_countx(); + int ncount0 = ncount; // never more than max numa node and at least 1 int nmax = 1 + (int)mi_option_get(mi_option_max_numa_node); if (ncount > nmax) ncount = nmax; if (ncount <= 0) ncount = 1; numa_node_count = ncount; + _mi_verbose_message("using %i numa regions (%i nodes detected)\n", numa_node_count, ncount0); } mi_assert_internal(numa_node_count >= 1); return numa_node_count; diff --git a/src/stats.c b/src/stats.c index 50bd029d..79362cc4 100644 --- a/src/stats.c +++ b/src/stats.c @@ -265,7 +265,7 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, mi_output_fun* out) mi_stat_counter_print(&stats->commit_calls, "commits", out); mi_stat_print(&stats->threads, "threads", -1, out); mi_stat_counter_print_avg(&stats->searches, "searches", out); - + _mi_fprintf(out, "%10s: %7i\n", "numa nodes", _mi_os_numa_node_count()); if (secs >= 0.0) _mi_fprintf(out, "%10s: %9.3f s\n", "elapsed", secs); double user_time; From f36ec5d9d8275777e05526468524dfd9d433164e Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 3 Nov 2019 13:16:07 -0800 Subject: [PATCH 15/48] reserve huge pages incrementally --- src/arena.c | 23 ++++++---- src/options.c | 1 - src/os.c | 120 ++++++++++++++++++++++++++++++++++---------------- 3 files changed, 96 insertions(+), 48 deletions(-) diff --git a/src/arena.c b/src/arena.c index 56b09859..24fd2114 100644 --- a/src/arena.c +++ b/src/arena.c @@ -27,7 +27,10 @@ with on-demand coalescing. void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); //int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept; void _mi_os_free(void* p, size_t size, mi_stats_t* stats); -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize); + +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, double max_secs, size_t* pages_reserved, size_t* psize); +void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats); + int _mi_os_numa_node_count(void); /* ----------------------------------------------------------- @@ -234,12 +237,12 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &block_index); if (p != NULL) { mi_assert_internal(block_index != SIZE_MAX); -#if MI_DEBUG>=1 + #if MI_DEBUG>=1 _Atomic(mi_block_info_t)* block = &arena->blocks[block_index]; mi_block_info_t binfo = mi_atomic_read(block); mi_assert_internal(mi_block_is_in_use(binfo)); mi_assert_internal(mi_block_count(binfo) >= needed_bcount); -#endif + #endif *memid = mi_memid_create(arena_index, block_index); *commit = true; // TODO: support commit on demand? 
*large = arena->is_large; @@ -382,18 +385,22 @@ static bool mi_arena_add(mi_arena_t* arena) { // reserve at a specific numa node int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept { - size_t hsize = 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, &hsize); - if (p==NULL) return ENOMEM; - _mi_verbose_message("reserved %zu huge (1GiB) pages\n", pages); + size_t hsize = 0; + size_t pages_reserved = 0; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, (double)pages / 2.0, &pages_reserved, &hsize); + if (p==NULL || pages_reserved==0) { + _mi_warning_message("failed to reserve %zu gb huge pages\n", pages); + return ENOMEM; + } + _mi_verbose_message("reserved %zu gb huge pages\n", pages_reserved); size_t bcount = hsize / MI_ARENA_BLOCK_SIZE; size_t asize = sizeof(mi_arena_t) + (bcount*sizeof(mi_block_info_t)); // one too much mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS? if (arena == NULL) { - _mi_os_free(p, hsize, &_mi_stats_main); + _mi_os_free_huge_pages(p, hsize, &_mi_stats_main); return ENOMEM; } arena->block_count = bcount; diff --git a/src/options.c b/src/options.c index 3a7833a2..11d12187 100644 --- a/src/options.c +++ b/src/options.c @@ -221,7 +221,6 @@ static void mi_add_stderr_output() { // -------------------------------------------------------- // Messages, all end up calling `_mi_fputs`. // -------------------------------------------------------- -#define MAX_ERROR_COUNT (10) static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings // When overriding malloc, we may recurse into mi_vfprintf if an allocation diff --git a/src/os.c b/src/os.c index af3c440c..5947333d 100644 --- a/src/os.c +++ b/src/os.c @@ -339,7 +339,8 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro lflags |= MAP_HUGETLB; #endif #ifdef MAP_HUGE_1GB - if ((size % GiB) == 0) { + static bool mi_huge_pages_available = true; + if ((size % GiB) == 0 && mi_huge_pages_available) { lflags |= MAP_HUGE_1GB; } else @@ -358,6 +359,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); #ifdef MAP_HUGE_1GB if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { + mi_huge_pages_available = false; // don't try huge 1GiB pages again _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno); lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); @@ -799,11 +801,11 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) mi_win_enable_large_os_pages(); - void* p = NULL; #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} }; // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages - if (pNtAllocateVirtualMemoryEx != NULL) { + static bool mi_huge_pages_available = true; + if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) #endif @@ -822,7 +824,8 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) return base; } else { - // fall back to regular huge pages + // fall back to 
regular large pages + mi_huge_pages_available = false; // don't try further huge pages _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err); } } @@ -830,20 +833,11 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) if (pVirtualAlloc2 != NULL && numa_node >= 0) { params[0].Type = MemExtendedParameterNumaNode; params[0].ULong = (unsigned)numa_node; - p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); + return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); } - else #endif - // use regular virtual alloc on older windows - { - p = VirtualAlloc(addr, size, flags, PAGE_READWRITE); - } - - if (p == NULL) { - DWORD winerr = GetLastError(); - _mi_warning_message("failed to allocate huge OS pages (size %zu) (windows error %d%s)\n", size, winerr, (winerr==1450 ? " (insufficient resources)" : "")); - } - return p; + // otherwise use regular virtual alloc on older windows + return VirtualAlloc(addr, size, flags, PAGE_READWRITE); } #elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) @@ -880,44 +874,92 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) // To ensure proper alignment, use our own area for huge OS pages static _Atomic(uintptr_t) mi_huge_start; // = 0 -// Allocate MI_SEGMENT_SIZE aligned huge pages -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) { - if (psize != NULL) *psize = 0; +// Claim an aligned address range for huge pages +static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { + if (total_size != NULL) *total_size = 0; const size_t size = pages * MI_HUGE_OS_PAGE_SIZE; - // Find a new aligned address for the huge pages uintptr_t start = 0; uintptr_t end = 0; uintptr_t expected; do { - start = expected = mi_atomic_read_relaxed(&mi_huge_start); + start = expected = mi_atomic_read_relaxed(&mi_huge_start); if (start == 0) { // Initialize the start address after the 32TiB area - start = ((uintptr_t)32 << 40); // 32TiB virtual start address - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode - uintptr_t r = _mi_random_init((uintptr_t)&_mi_os_alloc_huge_os_pages); + start = ((uintptr_t)32 << 40); // 32TiB virtual start address +#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode + uintptr_t r = _mi_random_init((uintptr_t)&mi_os_claim_huge_pages); start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB - #endif +#endif } end = start + size; mi_assert_internal(end % MI_SEGMENT_SIZE == 0); } while (!mi_atomic_cas_strong(&mi_huge_start, end, expected)); - // And allocate - void* p = mi_os_alloc_huge_os_pagesx((void*)start, size, numa_node); - if (p == NULL) { - return NULL; - } - _mi_stat_increase(&_mi_stats_main.committed, size); - _mi_stat_increase(&_mi_stats_main.reserved, size); - if ((uintptr_t)p % MI_SEGMENT_SIZE != 0) { // must be aligned - _mi_warning_message("huge page area was not aligned\n"); - _mi_os_free(p,size,&_mi_stats_main); - return NULL; - } + if (total_size != NULL) *total_size = size; + return (uint8_t*)start; +} + +// Allocate MI_SEGMENT_SIZE aligned huge pages +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, double max_secs, size_t* pages_reserved, size_t* psize) { + if (psize != NULL) *psize = 0; + if (pages_reserved != NULL) *pages_reserved = 0; + size_t 
size = 0; + uint8_t* start = mi_os_claim_huge_pages(pages, &size); - if (psize != NULL) *psize = size; - return p; + // Allocate one page at the time but try to place them contiguously + // We allocate one page at the time to be able to abort if it takes too long + // or to at least allocate as many as available on the system. + double start_t = _mi_clock_start(); + size_t page; + for (page = 0; page < pages; page++) { + // allocate a page + bool is_large = true; + void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE); + void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node); + + // Did we succeed at a contiguous address? + if (p != addr) { + // no success, issue a warning and break + if (p != NULL) { + _mi_warning_message("could not allocate contiguous huge page %zu at 0x%p\n", page, addr); + _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main); + } + break; + } + + // success, record it + _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); + _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); + + // check for timeout + double elapsed = _mi_clock_end(start_t); + if (page >= 1) { + double estimate = ((elapsed / (double)(page+1)) * (double)pages); + if (estimate > 1.5*max_secs) { // seems like we are going to timeout, break + elapsed = max_secs + 1.0; + } + } + if (elapsed > max_secs) { + _mi_warning_message("huge page allocation timed out\n"); + break; + } + } + mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size); + if (pages_reserved != NULL) *pages_reserved = page; + if (psize != NULL) *psize = page * MI_HUGE_OS_PAGE_SIZE; + return (page == 0 ? NULL : start); +} + +// free every huge page in a range individually (as we allocated per page) +// note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. +void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) { + if (p==NULL || size==0) return; + uint8_t* base = (uint8_t*)p; + while (size >= MI_HUGE_OS_PAGE_SIZE) { + _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats); + size -= MI_HUGE_OS_PAGE_SIZE; + } } /* ---------------------------------------------------------------------------- From 520a8dafee0747e1da8b220b28b35298f10512b2 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 3 Nov 2019 13:25:28 -0800 Subject: [PATCH 16/48] divide huge pages more even --- src/arena.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/arena.c b/src/arena.c index 24fd2114..95a102d1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -385,6 +385,7 @@ static bool mi_arena_add(mi_arena_t* arena) { // reserve at a specific numa node int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept { + if (pages==0) return 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); size_t hsize = 0; @@ -422,18 +423,20 @@ int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept { // pages per numa node int numa_count = _mi_os_numa_node_count(); if (numa_count <= 0) numa_count = 1; - size_t pages_per = pages / numa_count; - if (pages_per == 0) pages_per = 1; + const size_t pages_per = pages / numa_count; + const size_t pages_mod = pages % numa_count; // reserve evenly among numa nodes for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { - int err = mi_reserve_huge_os_pages_at((pages_per > pages ? 
pages : pages_per), numa_node); + size_t node_pages = pages_per; // can be 0 + if (numa_node < pages_mod) node_pages++; + int err = mi_reserve_huge_os_pages_at(node_pages, numa_node); if (err) return err; - if (pages < pages_per) { + if (pages < node_pages) { pages = 0; } else { - pages -= pages_per; + pages -= node_pages; } } From d1d65fbca4d037c5b9cc0838074804fde1f505c7 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 3 Nov 2019 13:25:41 -0800 Subject: [PATCH 17/48] make max error messages configurable --- include/mimalloc.h | 1 + src/options.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index b155aca6..c03ddc1e 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -277,6 +277,7 @@ typedef enum mi_option_e { mi_option_segment_reset, mi_option_os_tag, mi_option_max_numa_node, + mi_option_max_errors, _mi_option_last } mi_option_t; diff --git a/src/options.c b/src/options.c index 11d12187..63b1612a 100644 --- a/src/options.c +++ b/src/options.c @@ -14,6 +14,8 @@ terms of the MIT license. A copy of the license can be found in the file #include // toupper #include +static uintptr_t mi_max_error_count = 16; // stop outputting errors after this + static void mi_add_stderr_output(); int mi_version(void) mi_attr_noexcept { @@ -69,7 +71,8 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose - { 256, UNINIT, MI_OPTION(max_numa_node) } // maximum allowed numa node + { 256, UNINIT, MI_OPTION(max_numa_node) }, // maximum allowed numa node + { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output }; static void mi_option_init(mi_option_desc_t* desc); @@ -86,6 +89,7 @@ void _mi_options_init(void) { _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value); } } + mi_max_error_count = mi_option_get(mi_option_max_errors); } long mi_option_get(mi_option_t option) { @@ -275,7 +279,7 @@ void _mi_verbose_message(const char* fmt, ...) { void _mi_error_message(const char* fmt, ...) { if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; - if (mi_atomic_increment(&error_count) > MAX_ERROR_COUNT) return; + if (mi_atomic_increment(&error_count) > mi_max_error_count) return; va_list args; va_start(args,fmt); mi_vfprintf(NULL, "mimalloc: error: ", fmt, args); @@ -285,7 +289,7 @@ void _mi_error_message(const char* fmt, ...) { void _mi_warning_message(const char* fmt, ...) 
{ if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; - if (mi_atomic_increment(&error_count) > MAX_ERROR_COUNT) return; + if (mi_atomic_increment(&error_count) > mi_max_error_count) return; va_list args; va_start(args,fmt); mi_vfprintf(NULL, "mimalloc: warning: ", fmt, args); From 9d6a5acb228db9cd4ae8f50ef2295e9b5d57e3c8 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 3 Nov 2019 13:34:54 -0800 Subject: [PATCH 18/48] fix unix build warnings --- CMakeLists.txt | 5 +++-- src/arena.c | 2 +- src/heap.c | 2 +- src/os.c | 1 - src/page.c | 2 +- src/segment.c | 6 ++++-- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e96c237..12540f68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,5 @@ cmake_minimum_required(VERSION 3.0) project(libmimalloc C CXX) -include("cmake/mimalloc-config-version.cmake") -include("CheckIncludeFile") set(CMAKE_C_STANDARD 11) set(CMAKE_CXX_STANDARD 17) @@ -15,6 +13,9 @@ option(MI_SECURE "Use security mitigations (like guard pages and rand option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) option(MI_BUILD_TESTS "Build test executables" ON) +include("cmake/mimalloc-config-version.cmake") +include("CheckIncludeFile") + set(mi_install_dir "lib/mimalloc-${mi_version}") set(mi_sources diff --git a/src/arena.c b/src/arena.c index 95a102d1..08a36415 100644 --- a/src/arena.c +++ b/src/arena.c @@ -429,7 +429,7 @@ int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept { // reserve evenly among numa nodes for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { size_t node_pages = pages_per; // can be 0 - if (numa_node < pages_mod) node_pages++; + if ((size_t)numa_node < pages_mod) node_pages++; int err = mi_reserve_huge_os_pages_at(node_pages, numa_node); if (err) return err; if (pages < node_pages) { diff --git a/src/heap.c b/src/heap.c index 15c5d02a..162cf406 100644 --- a/src/heap.c +++ b/src/heap.c @@ -45,7 +45,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void } -#if MI_DEBUG>1 +#if MI_DEBUG>=3 static bool _mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { UNUSED(arg1); UNUSED(arg2); diff --git a/src/os.c b/src/os.c index 5947333d..3f299362 100644 --- a/src/os.c +++ b/src/os.c @@ -914,7 +914,6 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, double max_secs, s size_t page; for (page = 0; page < pages; page++) { // allocate a page - bool is_large = true; void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE); void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node); diff --git a/src/page.c b/src/page.c index f7fad764..32b68edb 100644 --- a/src/page.c +++ b/src/page.c @@ -38,7 +38,7 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_stats_t* stats); -#if (MI_DEBUG>1) +#if (MI_DEBUG>=3) static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { size_t count = 0; while (head != NULL) { diff --git a/src/segment.c b/src/segment.c index dcc6a04b..178e0eda 100644 --- a/src/segment.c +++ b/src/segment.c @@ -41,7 +41,7 @@ terms of the MIT license. 
A copy of the license can be found in the file ----------------------------------------------------------- */ -#if (MI_DEBUG>1) +#if (MI_DEBUG>=3) static bool mi_segment_queue_contains(const mi_segment_queue_t* queue, mi_segment_t* segment) { mi_assert_internal(segment != NULL); mi_segment_t* list = queue->first; @@ -111,7 +111,7 @@ static void mi_segment_insert_in_free_queue(mi_segment_t* segment, mi_segments_t Invariant checking ----------------------------------------------------------- */ -#if (MI_DEBUG > 1) +#if (MI_DEBUG>=2) static bool mi_segment_is_in_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_segment_queue_t* queue = mi_segment_free_queue(segment, tld); bool in_queue = (queue!=NULL && (segment->next != NULL || segment->prev != NULL || queue->first == segment)); @@ -120,7 +120,9 @@ static bool mi_segment_is_in_free_queue(mi_segment_t* segment, mi_segments_tld_t } return in_queue; } +#endif +#if (MI_DEBUG>=3) static size_t mi_segment_pagesize(mi_segment_t* segment) { return ((size_t)1 << segment->page_shift); } From 8afd06b248f6a82763292821bf5096e35f6a5a0b Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 4 Nov 2019 08:44:40 -0800 Subject: [PATCH 19/48] use int64 for time (instead of double) --- include/mimalloc-internal.h | 7 ++- src/arena.c | 4 +- src/memory.c | 1 + src/os.c | 22 ++++++--- src/stats.c | 95 ++++++++++++++++++------------------- 5 files changed, 70 insertions(+), 59 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index c28cf0fd..413f76e6 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -106,8 +106,11 @@ uintptr_t _mi_heap_random(mi_heap_t* heap); // "stats.c" void _mi_stats_done(mi_stats_t* stats); -double _mi_clock_end(double start); -double _mi_clock_start(void); + +typedef int64_t mi_msecs_t; +mi_msecs_t _mi_clock_now(void); +mi_msecs_t _mi_clock_end(mi_msecs_t start); +mi_msecs_t _mi_clock_start(void); // "alloc.c" void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_malloc_generic` diff --git a/src/arena.c b/src/arena.c index 08a36415..6faf7d3e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -28,7 +28,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* lar //int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept; void _mi_os_free(void* p, size_t size, mi_stats_t* stats); -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, double max_secs, size_t* pages_reserved, size_t* psize); +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize); void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats); int _mi_os_numa_node_count(void); @@ -390,7 +390,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept { if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); size_t hsize = 0; size_t pages_reserved = 0; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, (double)pages / 2.0, &pages_reserved, &hsize); + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, pages*500, &pages_reserved, &hsize); if (p==NULL || pages_reserved==0) { _mi_warning_message("failed to reserve %zu gb huge pages\n", pages); return ENOMEM; diff --git a/src/memory.c b/src/memory.c index a425393c..75a1df92 100644 --- a/src/memory.c +++ b/src/memory.c @@ -564,6 +564,7 @@ void _mi_mem_collect(mi_stats_t* stats) { } } + /* 
---------------------------------------------------------------------------- Other -----------------------------------------------------------------------------*/ diff --git a/src/os.c b/src/os.c index 3f299362..44ef9830 100644 --- a/src/os.c +++ b/src/os.c @@ -871,6 +871,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) } #endif +#if (MI_INTPTR_SIZE >= 8) // To ensure proper alignment, use our own area for huge OS pages static _Atomic(uintptr_t) mi_huge_start; // = 0 @@ -899,18 +900,25 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { if (total_size != NULL) *total_size = size; return (uint8_t*)start; } +#else +static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { + if (total_size != NULL) *total_size = 0; + return NULL; +} +#endif // Allocate MI_SEGMENT_SIZE aligned huge pages -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, double max_secs, size_t* pages_reserved, size_t* psize) { +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize) { if (psize != NULL) *psize = 0; if (pages_reserved != NULL) *pages_reserved = 0; size_t size = 0; uint8_t* start = mi_os_claim_huge_pages(pages, &size); + if (start == NULL) return NULL; // or 32-bit systems // Allocate one page at the time but try to place them contiguously // We allocate one page at the time to be able to abort if it takes too long // or to at least allocate as many as available on the system. - double start_t = _mi_clock_start(); + mi_msecs_t start_t = _mi_clock_start(); size_t page; for (page = 0; page < pages; page++) { // allocate a page @@ -932,14 +940,14 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, double max_secs, s _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout - double elapsed = _mi_clock_end(start_t); + mi_msecs_t elapsed = _mi_clock_end(start_t); if (page >= 1) { - double estimate = ((elapsed / (double)(page+1)) * (double)pages); - if (estimate > 1.5*max_secs) { // seems like we are going to timeout, break - elapsed = max_secs + 1.0; + mi_msecs_t estimate = ((elapsed / (page+1)) * pages); + if (estimate > 2*max_msecs) { // seems like we are going to timeout, break + elapsed = max_msecs + 1; } } - if (elapsed > max_secs) { + if (elapsed > max_msecs) { _mi_warning_message("huge page allocation timed out\n"); break; } diff --git a/src/stats.c b/src/stats.c index 79362cc4..a1248043 100644 --- a/src/stats.c +++ b/src/stats.c @@ -231,9 +231,9 @@ static void mi_stats_print_bins(mi_stat_count_t* all, const mi_stat_count_t* bin #endif -static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit); +static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit); -static void _mi_stats_print(mi_stats_t* stats, double secs, mi_output_fun* out) mi_attr_noexcept { +static void _mi_stats_print(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun* out) mi_attr_noexcept { mi_print_header(out); #if MI_STAT>1 mi_stat_count_t normal = { 0,0,0,0 }; @@ -266,16 +266,16 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, mi_output_fun* out) mi_stat_print(&stats->threads, "threads", -1, out); mi_stat_counter_print_avg(&stats->searches, "searches", out); _mi_fprintf(out, "%10s: %7i\n", "numa nodes", _mi_os_numa_node_count()); - if (secs >= 0.0) 
_mi_fprintf(out, "%10s: %9.3f s\n", "elapsed", secs); + if (elapsed > 0) _mi_fprintf(out, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000); - double user_time; - double sys_time; + mi_msecs_t user_time; + mi_msecs_t sys_time; size_t peak_rss; size_t page_faults; size_t page_reclaim; size_t peak_commit; mi_process_info(&user_time, &sys_time, &peak_rss, &page_faults, &page_reclaim, &peak_commit); - _mi_fprintf(out,"%10s: user: %.3f s, system: %.3f s, faults: %lu, reclaims: %lu, rss: ", "process", user_time, sys_time, (unsigned long)page_faults, (unsigned long)page_reclaim ); + _mi_fprintf(out,"%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, reclaims: %lu, rss: ", "process", user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults, (unsigned long)page_reclaim ); mi_printf_amount((int64_t)peak_rss, 1, out, "%s"); if (peak_commit > 0) { _mi_fprintf(out,", commit charge: "); @@ -284,9 +284,7 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, mi_output_fun* out) _mi_fprintf(out,"\n"); } -double _mi_clock_end(double start); -double _mi_clock_start(void); -static double mi_time_start = 0.0; +static mi_msecs_t mi_time_start; // = 0 static mi_stats_t* mi_stats_get_default(void) { mi_heap_t* heap = mi_heap_get_default(); @@ -316,71 +314,72 @@ void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` } -static void mi_stats_print_ex(mi_stats_t* stats, double secs, mi_output_fun* out) { +static void mi_stats_print_ex(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun* out) { mi_stats_merge_from(stats); - _mi_stats_print(&_mi_stats_main, secs, out); + _mi_stats_print(&_mi_stats_main, elapsed, out); } void mi_stats_print(mi_output_fun* out) mi_attr_noexcept { - mi_stats_print_ex(mi_stats_get_default(),_mi_clock_end(mi_time_start),out); + mi_msecs_t elapsed = _mi_clock_end(mi_time_start); + mi_stats_print_ex(mi_stats_get_default(),elapsed,out); } void mi_thread_stats_print(mi_output_fun* out) mi_attr_noexcept { - _mi_stats_print(mi_stats_get_default(), _mi_clock_end(mi_time_start), out); + mi_msecs_t elapsed = _mi_clock_end(mi_time_start); + _mi_stats_print(mi_stats_get_default(), elapsed, out); } - -// -------------------------------------------------------- -// Basic timer for convenience -// -------------------------------------------------------- - +// ---------------------------------------------------------------- +// Basic timer for convenience; use milli-seconds to avoid doubles +// ---------------------------------------------------------------- #ifdef _WIN32 #include -static double mi_to_seconds(LARGE_INTEGER t) { - static double freq = 0.0; - if (freq <= 0.0) { +static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) { + static LARGE_INTEGER mfreq; // = 0 + if (mfreq.QuadPart == 0LL) { LARGE_INTEGER f; QueryPerformanceFrequency(&f); - freq = (double)(f.QuadPart); + mfreq.QuadPart = f.QuadPart/1000LL; + if (mfreq.QuadPart == 0) mfreq.QuadPart = 1; } - return ((double)(t.QuadPart) / freq); + return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart); } -static double mi_clock_now(void) { +mi_msecs_t _mi_clock_now(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); - return mi_to_seconds(t); + return mi_to_msecs(t); } #else #include #ifdef CLOCK_REALTIME -static double mi_clock_now(void) { +mi_msecs_t _mi_clock_now(void) { struct timespec t; clock_gettime(CLOCK_REALTIME, &t); - return (double)t.tv_sec + (1.0e-9 * (double)t.tv_nsec); + return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); } #else // low 
resolution timer -static double mi_clock_now(void) { - return ((double)clock() / (double)CLOCKS_PER_SEC); +mi_msecs_t _mi_clock_now(void) { + return ((mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000)); } #endif #endif -static double mi_clock_diff = 0.0; +static mi_msecs_t mi_clock_diff; -double _mi_clock_start(void) { +mi_msecs_t _mi_clock_start(void) { if (mi_clock_diff == 0.0) { - double t0 = mi_clock_now(); - mi_clock_diff = mi_clock_now() - t0; + mi_msecs_t t0 = _mi_clock_now(); + mi_clock_diff = _mi_clock_now() - t0; } - return mi_clock_now(); + return _mi_clock_now(); } -double _mi_clock_end(double start) { - double end = mi_clock_now(); +mi_msecs_t _mi_clock_end(mi_msecs_t start) { + mi_msecs_t end = _mi_clock_now(); return (end - start - mi_clock_diff); } @@ -394,21 +393,21 @@ double _mi_clock_end(double start) { #include #pragma comment(lib,"psapi.lib") -static double filetime_secs(const FILETIME* ftime) { +static mi_msecs_t filetime_msecs(const FILETIME* ftime) { ULARGE_INTEGER i; i.LowPart = ftime->dwLowDateTime; i.HighPart = ftime->dwHighDateTime; - double secs = (double)(i.QuadPart) * 1.0e-7; // FILETIME is in 100 nano seconds - return secs; + mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds + return msecs; } -static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) { +static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) { FILETIME ct; FILETIME ut; FILETIME st; FILETIME et; GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); - *utime = filetime_secs(&ut); - *stime = filetime_secs(&st); + *utime = filetime_msecs(&ut); + *stime = filetime_msecs(&st); PROCESS_MEMORY_COUNTERS info; GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); @@ -427,11 +426,11 @@ static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size #include #endif -static double timeval_secs(const struct timeval* tv) { - return (double)tv->tv_sec + ((double)tv->tv_usec * 1.0e-6); +static mi_msecs_t timeval_secs(const struct timeval* tv) { + return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L); } -static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) { +static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) { struct rusage rusage; getrusage(RUSAGE_SELF, &rusage); #if defined(__APPLE__) && defined(__MACH__) @@ -452,12 +451,12 @@ static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size #pragma message("define a way to get process info") #endif -static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) { +static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) { *peak_rss = 0; *page_faults = 0; *page_reclaim = 0; *peak_commit = 0; - *utime = 0.0; - *stime = 0.0; + *utime = 0; + *stime = 0; } #endif From 3d0a1e249fa113e93792838a00a7acd9fc98aa34 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 4 Nov 2019 09:40:10 -0800 Subject: [PATCH 20/48] remove all floating point types and arithmetic --- src/arena.c | 1 - src/init.c | 3 +-- src/stats.c | 32 +++++++++++++++++++------------- 3 files 
changed, 20 insertions(+), 16 deletions(-) diff --git a/src/arena.c b/src/arena.c index 6faf7d3e..e58d2c47 100644 --- a/src/arena.c +++ b/src/arena.c @@ -25,7 +25,6 @@ with on-demand coalescing. // os.c void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); -//int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept; void _mi_os_free(void* p, size_t size, mi_stats_t* stats); void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize); diff --git a/src/init.c b/src/init.c index ed15aeba..ef848de4 100644 --- a/src/init.c +++ b/src/init.c @@ -433,8 +433,7 @@ static void mi_process_load(void) { } if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { - size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); - // double max_secs = (double)pages / 2.0; // 0.5s per page (1GiB) + size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); mi_reserve_huge_os_pages_interleave(pages); } } diff --git a/src/stats.c b/src/stats.c index a1248043..011fab64 100644 --- a/src/stats.c +++ b/src/stats.c @@ -130,19 +130,23 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, const char buf[32]; int len = 32; const char* suffix = (unit <= 0 ? " " : "b"); - double base = (unit == 0 ? 1000.0 : 1024.0); + const int64_t base = (unit == 0 ? 1000 : 1024); if (unit>0) n *= unit; - double pos = (double)(n < 0 ? -n : n); - if (pos < base) - snprintf(buf,len, "%d %s ", (int)n, suffix); - else if (pos < base*base) - snprintf(buf, len, "%.1f k%s", (double)n / base, suffix); - else if (pos < base*base*base) - snprintf(buf, len, "%.1f m%s", (double)n / (base*base), suffix); - else - snprintf(buf, len, "%.1f g%s", (double)n / (base*base*base), suffix); - + const int64_t pos = (n < 0 ? -n : n); + if (pos < base) { + snprintf(buf, len, "%d %s ", (int)n, suffix); + } + else { + int64_t divider = base; + const char* magnitude = "k"; + if (pos >= divider*base) { divider *= base; magnitude = "m"; } + if (pos >= divider*base) { divider *= base; magnitude = "g"; } + const int64_t tens = (n / (divider/10)); + const long whole = (long)(tens/10); + const long frac1 = (long)(tens%10); + snprintf(buf, len, "%ld.%ld %s%s", whole, frac1, magnitude, suffix); + } _mi_fprintf(out, (fmt==NULL ? "%11s" : fmt), buf); } @@ -199,8 +203,10 @@ static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg } static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out) { - double avg = (stat->count == 0 ? 0.0 : (double)stat->total / (double)stat->count); - _mi_fprintf(out, "%10s: %7.1f avg\n", msg, avg); + const int64_t avg_tens = (stat->count == 0 ? 
0 : (stat->total*10 / stat->count)); + const long avg_whole = (long)(avg_tens/10); + const long avg_frac1 = (long)(avg_tens%10); + _mi_fprintf(out, "%10s: %5ld.%ld avg %ld %ld\n", msg, avg_whole, avg_frac1); } From 829fd872f407c5e201cd844b8f26f2c87915e89b Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 4 Nov 2019 11:48:41 -0800 Subject: [PATCH 21/48] initial delay slots --- include/mimalloc-internal.h | 11 ++- include/mimalloc-types.h | 26 +++++-- include/mimalloc.h | 1 + src/heap.c | 2 +- src/init.c | 4 +- src/memory.c | 143 +++++++++++++++++++++++++++++++----- src/options.c | 1 + src/segment.c | 31 ++++---- src/stats.c | 2 +- 9 files changed, 171 insertions(+), 50 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 413f76e6..25a3d93d 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -61,15 +61,15 @@ int _mi_os_numa_node_count(void); // memory.c void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld); -void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats); +void _mi_mem_free(void* p, size_t size, size_t id, mi_os_tld_t* tld); -bool _mi_mem_reset(void* p, size_t size, mi_stats_t* stats); -bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); +bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld); +bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld); +bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld); bool _mi_mem_protect(void* addr, size_t size); bool _mi_mem_unprotect(void* addr, size_t size); -void _mi_mem_collect(mi_stats_t* stats); +void _mi_mem_collect(mi_os_tld_t* tld); // "segment.c" mi_page_t* _mi_segment_page_alloc(size_t block_wsize, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); @@ -107,7 +107,6 @@ uintptr_t _mi_heap_random(mi_heap_t* heap); // "stats.c" void _mi_stats_done(mi_stats_t* stats); -typedef int64_t mi_msecs_t; mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 99b6b22b..8a3ffff4 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -385,6 +385,19 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) +// ------------------------------------------------------ +// Delay slots (to avoid expensive OS calls) +// ------------------------------------------------------ +typedef int64_t mi_msecs_t; + +typedef struct mi_delay_slot_s { + mi_msecs_t expire; + uint8_t* addr; + size_t size; +} mi_delay_slot_t; + +#define MI_RESET_DELAY_SLOTS (128) + // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ @@ -395,6 +408,12 @@ typedef struct mi_segment_queue_s { mi_segment_t* last; } mi_segment_queue_t; +// OS thread local data +typedef struct mi_os_tld_s { + size_t region_idx; // start point for next allocation + mi_stats_t* stats; // points to tld stats + mi_delay_slot_t reset_delay[MI_RESET_DELAY_SLOTS]; +} mi_os_tld_t; // Segments thread local data typedef struct mi_segments_tld_s { @@ -408,14 +427,9 @@ typedef struct mi_segments_tld_s { size_t cache_size; // total size of all segments in the cache mi_segment_t* cache; // (small) 
cache of segments mi_stats_t* stats; // points to tld stats + mi_os_tld_t* os; // points to os stats } mi_segments_tld_t; -// OS thread local data -typedef struct mi_os_tld_s { - size_t region_idx; // start point for next allocation - mi_stats_t* stats; // points to tld stats -} mi_os_tld_t; - // Thread local data struct mi_tld_s { unsigned long long heartbeat; // monotonic heartbeat count diff --git a/include/mimalloc.h b/include/mimalloc.h index c03ddc1e..e6fa9c2b 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -275,6 +275,7 @@ typedef enum mi_option_e { mi_option_reset_decommits, mi_option_eager_commit_delay, mi_option_segment_reset, + mi_option_reset_delay, mi_option_os_tag, mi_option_max_numa_node, mi_option_max_errors, diff --git a/src/heap.c b/src/heap.c index 162cf406..d03925d5 100644 --- a/src/heap.c +++ b/src/heap.c @@ -149,7 +149,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect regions if (collect >= FORCE && _mi_is_main_thread()) { - _mi_mem_collect(&heap->tld->stats); + _mi_mem_collect(&heap->tld->os); } } diff --git a/src/init.c b/src/init.c index ef848de4..971a93c0 100644 --- a/src/init.c +++ b/src/init.c @@ -94,11 +94,12 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; #define tld_main_stats ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats))) +#define tld_main_os ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os))) static mi_tld_t tld_main = { 0, false, &_mi_heap_main, - { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments + { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os }, // segments { 0, tld_main_stats }, // os { MI_STATS_NULL } // stats }; @@ -218,6 +219,7 @@ static bool _mi_heap_init(void) { memset(tld, 0, sizeof(*tld)); tld->heap_backing = heap; tld->segments.stats = &tld->stats; + tld->segments.os = &tld->os; tld->os.stats = &tld->stats; _mi_heap_default = heap; } diff --git a/src/memory.c b/src/memory.c index 75a1df92..e12405c1 100644 --- a/src/memory.c +++ b/src/memory.c @@ -53,6 +53,9 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats); void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld); void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld); +// local +static bool mi_delay_remove(mi_delay_slot_t* slots, size_t count, void* p, size_t size); + // Constants #if (MI_INTPTR_SIZE==8) @@ -470,16 +473,19 @@ Free -----------------------------------------------------------------------------*/ // Free previously allocated memory with a given id. 
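// In the updated free path below, the freed range is first removed from the pending
// reset delay slots (mi_delay_remove) so that a delayed reset can never be applied to
// memory that has already been freed and possibly handed out again.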
-void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); +void _mi_mem_free(void* p, size_t size, size_t id, mi_os_tld_t* tld) { + mi_assert_internal(size > 0 && tld != NULL); if (p==NULL) return; if (size==0) return; + + mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, p, size); + size_t arena_memid = 0; size_t idx = 0; size_t bitidx = 0; if (mi_memid_indices(id,&idx,&bitidx,&arena_memid)) { // was a direct arena allocation, pass through - _mi_arena_free(p, size, arena_memid, stats); + _mi_arena_free(p, size, arena_memid, tld->stats); } else { // allocated in a region @@ -512,14 +518,14 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { (mi_option_is_enabled(mi_option_eager_commit) || // cannot reset halfway committed segments, use `option_page_reset` instead mi_option_is_enabled(mi_option_reset_decommits))) // but we can decommit halfway committed segments { - _mi_os_reset(p, size, stats); + _mi_os_reset(p, size, tld->stats); // cannot use delay reset! (due to concurrent allocation in the same region) //_mi_os_decommit(p, size, stats); // todo: and clear dirty bits? } } } if (!is_eager_committed) { // adjust commit statistics as we commit again when re-using the same slot - _mi_stat_decrease(&stats->committed, mi_good_commit_size(size)); + _mi_stat_decrease(&tld->stats->committed, mi_good_commit_size(size)); } // TODO: should we free empty regions? currently only done _mi_mem_collect. @@ -539,7 +545,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { /* ---------------------------------------------------------------------------- collection -----------------------------------------------------------------------------*/ -void _mi_mem_collect(mi_stats_t* stats) { +void _mi_mem_collect(mi_os_tld_t* tld) { // free every region that has no segments in use. for (size_t i = 0; i < regions_count; i++) { mem_region_t* region = ®ions[i]; @@ -554,7 +560,8 @@ void _mi_mem_collect(mi_stats_t* stats) { bool is_eager_committed; void* start = mi_region_info_read(mi_atomic_read(®ion->info), NULL, &is_eager_committed); if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { - _mi_arena_free(start, MI_REGION_SIZE, region->arena_memid, stats); + mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, start, MI_REGION_SIZE); + _mi_arena_free(start, MI_REGION_SIZE, region->arena_memid, tld->stats); } // and release mi_atomic_write(®ion->info,0); @@ -564,25 +571,123 @@ void _mi_mem_collect(mi_stats_t* stats) { } } +/* ---------------------------------------------------------------------------- + Delay slots +-----------------------------------------------------------------------------*/ + +typedef void (mi_delay_resolve_fun)(void* addr, size_t size, void* arg); + +static void mi_delay_insert(mi_delay_slot_t* slots, size_t count, + mi_msecs_t delay, uint8_t* addr, size_t size, + mi_delay_resolve_fun* resolve, void* arg) +{ + if (delay==0) { + resolve(addr, size, arg); + return; + } + + mi_msecs_t now = _mi_clock_now(); + mi_delay_slot_t* oldest = slots; + // walk through all slots, resolving expired ones. + // remember the oldest slot to insert the new entry in. + for (size_t i = 0; i < count; i++) { + mi_delay_slot_t* slot = &slots[i]; + + if (slot->expire == 0) { + // empty slot + oldest = slot; + } + // TODO: should we handle overlapping areas too? 
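+    // The cases below: a slot that already covers the new range only gets its
+    // expiration pushed out; a slot fully covered by the new range is overwritten;
+    // slots whose delay has passed are resolved right away; otherwise we remember
+    // the slot with the earliest expiration so it can be evicted after the loop.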
+ else if (slot->addr <= addr && slot->addr + slot->size >= addr + size) { + // earlier slot encompasses new area, increase expiration + slot->expire = now + delay; + delay = 0; + } + else if (addr <= slot->addr && addr + size >= slot->addr + slot->size) { + // new one encompasses old slot, overwrite + slot->expire = now + delay; + slot->addr = addr; + slot->size = size; + delay = 0; + } + else if (slot->expire < now) { + // expired slot, resolve now + slot->expire = 0; + resolve(slot->addr, slot->size, arg); + } + else if (oldest->expire > slot->expire) { + oldest = slot; + } + } + if (delay>0) { + // not yet registered, use the oldest slot + if (oldest->expire > 0) { + resolve(oldest->addr, oldest->size, arg); // evict if not empty + } + oldest->expire = now + delay; + oldest->addr = addr; + oldest->size = size; + } +} + +static bool mi_delay_remove(mi_delay_slot_t* slots, size_t count, void* p, size_t size) +{ + uint8_t* addr = (uint8_t*)p; + bool done = false; + // walk through all slots + for (size_t i = 0; i < count; i++) { + mi_delay_slot_t* slot = &slots[i]; + if (slot->addr <= addr && slot->addr + slot->size >= addr + size) { + // earlier slot encompasses the area; remove it + slot->expire = 0; + done = true; + } + else if (addr <= slot->addr && addr + size >= slot->addr + slot->size) { + // new one encompasses old slot, remove it + slot->expire = 0; + } + else if ((addr <= slot->addr && addr + size > slot->addr) || + (addr < slot->addr + slot->size && addr + size >= slot->addr + slot->size)) { + // partial overlap, remove slot + mi_assert_internal(false); + slot->expire = 0; + } + } + return done; +} + +static void mi_resolve_reset(void* p, size_t size, void* vtld) { + mi_os_tld_t* tld = (mi_os_tld_t*)vtld; + _mi_os_reset(p, size, tld->stats); +} + +bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) { + mi_delay_insert(tld->reset_delay, MI_RESET_DELAY_SLOTS, mi_option_get(mi_option_reset_delay), + (uint8_t*)p, size, &mi_resolve_reset, tld); + return true; +} + +bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { + if (!mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, (uint8_t*)p, size)) { + return _mi_os_unreset(p, size, is_zero, tld->stats); + } + return true; +} + + /* ---------------------------------------------------------------------------- Other -----------------------------------------------------------------------------*/ -bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats) { - return _mi_os_commit(p, size, is_zero, stats); +bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { + mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, p, size); + return _mi_os_commit(p, size, is_zero, tld->stats); } -bool _mi_mem_decommit(void* p, size_t size, mi_stats_t* stats) { - return _mi_os_decommit(p, size, stats); -} - -bool _mi_mem_reset(void* p, size_t size, mi_stats_t* stats) { - return _mi_os_reset(p, size, stats); -} - -bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats) { - return _mi_os_unreset(p, size, is_zero, stats); +bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) { + mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, p, size); + return _mi_os_decommit(p, size, tld->stats); } bool _mi_mem_protect(void* p, size_t size) { diff --git a/src/options.c b/src/options.c index 63b1612a..e098af0b 100644 --- a/src/options.c +++ b/src/options.c @@ -70,6 +70,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(reset_decommits) 
}, // note: cannot enable this if secure is on { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) + { 500, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 256, UNINIT, MI_OPTION(max_numa_node) }, // maximum allowed numa node { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output diff --git a/src/segment.c b/src/segment.c index 178e0eda..b9abe2b3 100644 --- a/src/segment.c +++ b/src/segment.c @@ -234,7 +234,7 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se mi_assert_internal(!segment->mem_is_fixed); _mi_mem_unprotect(segment, segment->segment_size); // ensure no more guard pages are set } - _mi_mem_free(segment, segment_size, segment->memid, tld->stats); + _mi_mem_free(segment, segment_size, segment->memid, tld->os); } @@ -281,7 +281,7 @@ static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) } mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); if (!segment->mem_is_fixed && mi_option_is_enabled(mi_option_cache_reset)) { - _mi_mem_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size, tld->stats); + _mi_mem_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size, tld->os); } segment->next = tld->cache; tld->cache = segment; @@ -346,13 +346,13 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, } if (!segment->mem_is_committed && page_kind > MI_PAGE_MEDIUM) { mi_assert_internal(!segment->mem_is_fixed); - _mi_mem_commit(segment, segment->segment_size, &is_zero, tld->stats); + _mi_mem_commit(segment, segment->segment_size, &is_zero, tld->os); segment->mem_is_committed = true; } if (!segment->mem_is_fixed && (mi_option_is_enabled(mi_option_cache_reset) || mi_option_is_enabled(mi_option_page_reset))) { bool reset_zero = false; - _mi_mem_unreset(segment, segment->segment_size, &reset_zero, tld->stats); + _mi_mem_unreset(segment, segment->segment_size, &reset_zero, tld->os); if (reset_zero) is_zero = true; } } @@ -365,7 +365,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, if (!commit) { // ensure the initial info is committed bool commit_zero = false; - _mi_mem_commit(segment, info_size, &commit_zero, tld->stats); + _mi_mem_commit(segment, info_size, &commit_zero, tld->os); if (commit_zero) is_zero = true; } segment->memid = memid; @@ -459,7 +459,7 @@ static bool mi_segment_has_free(const mi_segment_t* segment) { return (segment->used < segment->capacity); } -static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_stats_t* stats) { +static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(mi_segment_has_free(segment)); mi_assert_expensive(mi_segment_is_valid(segment)); for (size_t i = 0; i < segment->capacity; i++) { @@ -472,14 +472,14 @@ static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_stats_t* stats) mi_assert_internal(!segment->mem_is_fixed); page->is_committed = true; bool is_zero = false; - _mi_mem_commit(start,psize,&is_zero,stats); + _mi_mem_commit(start,psize,&is_zero,tld->os); if (is_zero) page->is_zero_init = true; } if (page->is_reset) { mi_assert_internal(!segment->mem_is_fixed); 
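      // the page was reset earlier: clear the flag and unreset its memory before
      // reuse; if the OS hands the range back zeroed, mark the page zero-initialized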
page->is_reset = false; bool is_zero = false; - _mi_mem_unreset(start, psize, &is_zero, stats); + _mi_mem_unreset(start, psize, &is_zero, tld->os); if (is_zero) page->is_zero_init = true; } } @@ -497,21 +497,20 @@ static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_stats_t* stats) static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); -static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_stats_t* stats) { - UNUSED(stats); +static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { mi_assert_internal(page->segment_in_use); mi_assert_internal(mi_page_all_free(page)); mi_assert_internal(page->is_committed); size_t inuse = page->capacity * page->block_size; - _mi_stat_decrease(&stats->page_committed, inuse); - _mi_stat_decrease(&stats->pages, 1); + _mi_stat_decrease(&tld->stats->page_committed, inuse); + _mi_stat_decrease(&tld->stats->pages, 1); // reset the page memory to reduce memory pressure? if (!segment->mem_is_fixed && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) { size_t psize; uint8_t* start = _mi_page_start(segment, page, &psize); page->is_reset = true; - _mi_mem_reset(start, psize, stats); + _mi_mem_reset(start, psize, tld->os); } // zero the page data, but not the segment fields @@ -529,7 +528,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) mi_assert_expensive(mi_segment_is_valid(segment)); // mark it as free now - mi_segment_page_clear(segment, page, tld->stats); + mi_segment_page_clear(segment, page, tld); if (segment->used == 0) { // no more used pages; remove from the free list and free the segment @@ -634,7 +633,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen _mi_stat_decrease(&tld->stats->pages_abandoned, 1); if (mi_page_all_free(page)) { // if everything free by now, free the page - mi_segment_page_clear(segment,page,tld->stats); + mi_segment_page_clear(segment,page,tld); } else { // otherwise reclaim it @@ -666,7 +665,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen // Requires that the page has free pages static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(mi_segment_has_free(segment)); - mi_page_t* page = mi_segment_find_free(segment, tld->stats); + mi_page_t* page = mi_segment_find_free(segment, tld); page->segment_in_use = true; segment->used++; mi_assert_internal(segment->used <= segment->capacity); diff --git a/src/stats.c b/src/stats.c index 011fab64..cb6d8866 100644 --- a/src/stats.c +++ b/src/stats.c @@ -206,7 +206,7 @@ static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* const int64_t avg_tens = (stat->count == 0 ? 
0 : (stat->total*10 / stat->count)); const long avg_whole = (long)(avg_tens/10); const long avg_frac1 = (long)(avg_tens%10); - _mi_fprintf(out, "%10s: %5ld.%ld avg %ld %ld\n", msg, avg_whole, avg_frac1); + _mi_fprintf(out, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1); } From 288726606390edb4ffb9664b9bce0271516b550d Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 6 Nov 2019 14:17:36 -0800 Subject: [PATCH 22/48] optimize get numa node for single node systems --- src/os.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/os.c b/src/os.c index 44ef9830..254f85f1 100644 --- a/src/os.c +++ b/src/os.c @@ -1046,9 +1046,10 @@ int _mi_os_numa_node_count(void) { int _mi_os_numa_node(mi_os_tld_t* tld) { UNUSED(tld); - int numa_node = mi_os_numa_nodex(); - // never more than the node count and >= 0 int numa_count = _mi_os_numa_node_count(); + if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 + // never more than the node count and >= 0 + int numa_node = mi_os_numa_nodex(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } if (numa_node < 0) numa_node = 0; return numa_node; From 00e19cad9abd225bb4c0975c4f9b6e440a81b97c Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 6 Nov 2019 21:37:23 -0800 Subject: [PATCH 23/48] refactor region code, split out atomic bitmap --- ide/vs2019/mimalloc-override.vcxproj | 2 +- ide/vs2019/mimalloc.vcxproj | 3 +- include/mimalloc-atomic.h | 31 ++- src/bitmap.inc.c | 160 +++++++++++++ src/memory.c | 339 ++++++++++----------------- 5 files changed, 318 insertions(+), 217 deletions(-) create mode 100644 src/bitmap.inc.c diff --git a/ide/vs2019/mimalloc-override.vcxproj b/ide/vs2019/mimalloc-override.vcxproj index 09fd37fb..e1c7535c 100644 --- a/ide/vs2019/mimalloc-override.vcxproj +++ b/ide/vs2019/mimalloc-override.vcxproj @@ -123,7 +123,7 @@ true true ../../include - MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false Default diff --git a/ide/vs2019/mimalloc.vcxproj b/ide/vs2019/mimalloc.vcxproj index 1fabff5e..19696c10 100644 --- a/ide/vs2019/mimalloc.vcxproj +++ b/ide/vs2019/mimalloc.vcxproj @@ -116,7 +116,7 @@ true true ../../include - MI_DEBUG=1;%(PreprocessorDefinitions); + MI_DEBUG=3;%(PreprocessorDefinitions); CompileAsCpp false stdcpp17 @@ -218,6 +218,7 @@ + diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index dff0f011..c18f990f 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -36,6 +36,13 @@ static inline void mi_atomic_add64(volatile int64_t* p, int64_t add); // Atomically add a value; returns the previous value. Memory ordering is relaxed. static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add); +// Atomically "and" a value; returns the previous value. Memory ordering is relaxed. +static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x); + +// Atomically "or" a value; returns the previous value. Memory ordering is relaxed. +static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x); + + // Atomically compare and exchange a value; returns `true` if successful. // May fail spuriously. Memory ordering as release on success, and relaxed on failure. 
// (Note: expected and desired are in opposite order from atomic_compare_exchange) @@ -121,22 +128,28 @@ static inline void* mi_atomic_exchange_ptr(volatile _Atomic(void*)* p, void* exc #include #ifdef _WIN64 typedef LONG64 msc_intptr_t; -#define RC64(f) f##64 +#define MI_64(f) f##64 #else typedef LONG msc_intptr_t; -#define RC64(f) f +#define MI_64(f) f #endif static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) { - return (intptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); + return (intptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); +} +static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x); +} +static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { - return (expected == (uintptr_t)RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected)); + return (expected == (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected)); } static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { return mi_atomic_cas_strong(p,desired,expected); } static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { - return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); + return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); } static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) { return *p; @@ -177,6 +190,14 @@ static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add MI_USING_STD return atomic_fetch_add_explicit(p, add, memory_order_relaxed); } +static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + MI_USING_STD + return atomic_fetch_and_explicit(p, x, memory_order_relaxed); +} +static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + MI_USING_STD + return atomic_fetch_or_explicit(p, x, memory_order_relaxed); +} static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { MI_USING_STD return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_release, memory_order_relaxed); diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c new file mode 100644 index 00000000..5bea4748 --- /dev/null +++ b/src/bitmap.inc.c @@ -0,0 +1,160 @@ +#pragma once +#ifndef MI_BITMAP_H +#define MI_BITMAP_H + +#include "mimalloc.h" +#include "mimalloc-internal.h" + +// Use bit scan forward to quickly find the first zero bit if it is available +#if defined(_MSC_VER) +#define MI_HAVE_BITSCAN +#include +static inline size_t mi_bsf(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + MI_64(_BitScanForward)(&idx, x); + return idx; +} +static inline size_t mi_bsr(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + MI_64(_BitScanReverse)(&idx, x); + return idx; +} +#elif defined(__GNUC__) || defined(__clang__) +#define MI_HAVE_BITSCAN +#if (INTPTR_MAX == LONG_MAX) +# define MI_L(x) x##l +#else +# define MI_L(x) x##ll 
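+// MI_L pastes an `l` or `ll` suffix onto the name, so MI_L(__builtin_ctz) selects
+// __builtin_ctzl or __builtin_ctzll to match the width of uintptr_t.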
+#endif +static inline size_t mi_bsf(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x)); +} +static inline size_t mi_bsr(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x)); +} +#endif + + +#define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE) +#define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set + +// An atomic bitmap of `uintptr_t` fields +typedef volatile _Atomic(uintptr_t) mi_bitmap_field_t; +typedef mi_bitmap_field_t* mi_bitmap_t; + +// A bitmap index is the index of the bit in a bitmap. +typedef size_t mi_bitmap_index_t; + +// Create a bit index. +static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { + mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); + return (idx*MI_BITMAP_FIELD_BITS) + bitidx; +} + +// Get the field index from a bit index. +static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { + return (bitmap_idx / MI_BITMAP_FIELD_BITS); +} + +// Get the bit index in a bitmap field +static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) { + return (bitmap_idx % MI_BITMAP_FIELD_BITS); +} + +// The bit mask for a given number of blocks at a specified bit index. +static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { + mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); + return ((((uintptr_t)1 << count) - 1) << bitidx); +} + +// Try to atomically claim a sequence of `count` bits in a single field at `idx` in `bitmap`. +// Returns `true` on success. +static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) +{ + mi_assert_internal(bitmap_idx != NULL); + volatile _Atomic(uintptr_t)* field = &bitmap[idx]; + uintptr_t map = mi_atomic_read(field); + if (map==MI_BITMAP_FIELD_FULL) return false; // short cut + + // search for 0-bit sequence of length count + const uintptr_t mask = mi_bitmap_mask_(count, 0); + const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; + +#ifdef MI_HAVE_BITSCAN + size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible +#else + size_t bitidx = 0; // otherwise start at 0 +#endif + uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx + + // scan linearly for a free range of zero bits + while (bitidx <= bitidx_max) { + if ((map & m) == 0) { // are the mask bits free at bitidx? + mi_assert_internal((m >> bitidx) == mask); // no overflow? + uintptr_t newmap = map | m; + mi_assert_internal((newmap^map) >> bitidx == mask); + if (!mi_atomic_cas_weak(field, newmap, map)) { // TODO: use strong cas here? + // no success, another thread claimed concurrently.. keep going + map = mi_atomic_read(field); + continue; + } + else { + // success, we claimed the bits! + *bitmap_idx = mi_bitmap_index_create(idx, bitidx); + return true; + } + } + else { + // on to the next bit range +#ifdef MI_HAVE_BITSCAN + size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); + mi_assert_internal(shift > 0 && shift <= count); +#else + size_t shift = 1; +#endif + bitidx += shift; + m <<= shift; + } + } + // no bits found + return false; +} + + +// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success. +// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields. 
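+// For example, a hypothetical caller with a bitmap `map` of MI_REGION_MAX fields
+// that needs 4 contiguous blocks could do:
+//   mi_bitmap_index_t idx;
+//   if (mi_bitmap_try_claim(map, MI_REGION_MAX, 4, &idx)) {
+//     ... use the 4 blocks at `idx` ...
+//     mi_bitmap_unclaim(map, MI_REGION_MAX, 4, idx);  // release them again
+//   }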
+static inline bool mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t* bitmap_idx) { + for (size_t idx = 0; idx < bitmap_fields; idx++) { + if (mi_bitmap_try_claim_field(bitmap, idx, count, bitmap_idx)) { + return true; + } + } + return false; +} + +// Set `count` bits at `bitmap_idx` to 0 atomically +static inline void mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const uintptr_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); + mi_assert_internal((bitmap[idx] & mask) == mask); + mi_atomic_and(&bitmap[idx], ~mask); +} + + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously +static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const uintptr_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); + // mi_assert_internal((bitmap[idx] & mask) == 0); + uintptr_t prev = mi_atomic_or(&bitmap[idx], mask); + return ((prev & mask) == 0); +} + +#endif \ No newline at end of file diff --git a/src/memory.c b/src/memory.c index 75a1df92..29e0e412 100644 --- a/src/memory.c +++ b/src/memory.c @@ -37,6 +37,8 @@ Possible issues: #include // memset +#include "bitmap.inc.c" + // Internal raw OS interface size_t _mi_os_large_page_size(); bool _mi_os_protect(void* addr, size_t size); @@ -56,22 +58,22 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo // Constants #if (MI_INTPTR_SIZE==8) -#define MI_HEAP_REGION_MAX_SIZE (256 * (1ULL << 30)) // 256GiB => 16KiB for the region map +#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 16KiB for the region map #elif (MI_INTPTR_SIZE==4) -#define MI_HEAP_REGION_MAX_SIZE (3 * (1UL << 30)) // 3GiB => 196 bytes for the region map +#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // 196 bytes for the region map #else #error "define the maximum heap space allowed for regions on this platform" #endif #define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE -#define MI_REGION_MAP_BITS (MI_INTPTR_SIZE * 8) -#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_REGION_MAP_BITS) -#define MI_REGION_MAX_ALLOC_SIZE ((MI_REGION_MAP_BITS/4)*MI_SEGMENT_SIZE) // 64MiB -#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) -#define MI_REGION_MAP_FULL UINTPTR_MAX +#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB +#define MI_REGION_MAX_ALLOC_SIZE (MI_REGION_SIZE/4) // 64MiB +#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) +// Region info is a pointer to the memory region and two bits for +// its flags: is_large, and is_committed. typedef uintptr_t mi_region_info_t; static inline mi_region_info_t mi_region_info_create(void* start, bool is_large, bool is_committed) { @@ -88,19 +90,22 @@ static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, b // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. 
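The `mi_region_info_t` mentioned above is a tagged pointer: region starts are MI_SEGMENT_SIZE aligned, so the two low bits are free to carry the `is_large` and `is_committed` flags. The helper bodies are not part of this hunk; the following is a minimal sketch of how such packing can work (the exact bit positions are an assumption here):

static inline mi_region_info_t mi_region_info_create(void* start, bool is_large, bool is_committed) {
  // the start pointer is MI_SEGMENT_SIZE aligned, so bits 0 and 1 are available for flags
  return ((uintptr_t)start | (uintptr_t)(is_large ? 2 : 0) | (uintptr_t)(is_committed ? 1 : 0));
}

static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, bool* is_committed) {
  if (is_large != NULL)     *is_large     = ((info & 0x02) != 0);
  if (is_committed != NULL) *is_committed = ((info & 0x01) != 0);
  return (void*)(info & ~((uintptr_t)0x03));   // strip the flag bits to recover the pointer
}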
typedef struct mem_region_s { - volatile _Atomic(uintptr_t) map; // in-use bit per MI_SEGMENT_SIZE block - volatile _Atomic(mi_region_info_t) info; // start of virtual memory area, and flags - volatile _Atomic(uintptr_t) dirty_mask; // bit per block if the contents are not zero'd + volatile _Atomic(mi_region_info_t) info; // start of the memory area (and flags) volatile _Atomic(uintptr_t) numa_node; // associated numa node + 1 (so 0 is no association) - size_t arena_memid; // if allocated from a (huge page) arena + size_t arena_memid; // if allocated from a (huge page) arena } mem_region_t; - // The region map; 16KiB for a 256GiB HEAP_REGION_MAX -// TODO: in the future, maintain a map per NUMA node for numa aware allocation static mem_region_t regions[MI_REGION_MAX]; -static volatile _Atomic(uintptr_t) regions_count; // = 0; // allocated regions +// A bit mask per region for its claimed MI_SEGMENT_SIZE blocks. +static mi_bitmap_field_t regions_map[MI_REGION_MAX]; + +// A bit mask per region to track which blocks are dirty (= potentially written to) +static mi_bitmap_field_t regions_dirty[MI_REGION_MAX]; + +// Allocated regions +static volatile _Atomic(uintptr_t) regions_count; // = 0; /* ---------------------------------------------------------------------------- @@ -113,12 +118,6 @@ static size_t mi_region_block_count(size_t size) { return (size + MI_SEGMENT_SIZE - 1) / MI_SEGMENT_SIZE; } -// The bit mask for a given number of blocks at a specified bit index. -static uintptr_t mi_region_block_mask(size_t blocks, size_t bitidx) { - mi_assert_internal(blocks + bitidx <= MI_REGION_MAP_BITS); - return ((((uintptr_t)1 << blocks) - 1) << bitidx); -} - // Return a rounded commit/reset size such that we don't fragment large OS pages into small ones. static size_t mi_good_commit_size(size_t size) { if (size > (SIZE_MAX - _mi_os_large_page_size())) return size; @@ -137,8 +136,8 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { } -static size_t mi_memid_create(size_t idx, size_t bitidx) { - return ((idx*MI_REGION_MAP_BITS) + bitidx)<<1; +static size_t mi_memid_create(mi_bitmap_index_t bitmap_idx) { + return bitmap_idx<<1; } static size_t mi_memid_create_from_arena(size_t arena_memid) { @@ -149,78 +148,57 @@ static bool mi_memid_is_arena(size_t id) { return ((id&1)==1); } -static bool mi_memid_indices(size_t id, size_t* idx, size_t* bitidx, size_t* arena_memid) { +static bool mi_memid_indices(size_t id, mi_bitmap_index_t* bitmap_idx, size_t* arena_memid) { if (mi_memid_is_arena(id)) { *arena_memid = (id>>1); return true; } else { - *idx = ((id>>1) / MI_REGION_MAP_BITS); - *bitidx = ((id>>1) % MI_REGION_MAP_BITS); + *bitmap_idx = (mi_bitmap_index_t)(id>>1); return false; } } /* ---------------------------------------------------------------------------- -Commit from a region + Ensure a region is allocated from the OS (or an arena) -----------------------------------------------------------------------------*/ -// Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`. -// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. -// (not being able to claim is not considered an error so check for `p != NULL` afterwards). 
-static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, - size_t size, bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld) +static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_info_t* pinfo, mi_os_tld_t* tld) { - size_t mask = mi_region_block_mask(blocks,bitidx); - mi_assert_internal(mask != 0); - mi_assert_internal((mask & mi_atomic_read_relaxed(®ion->map)) == mask); - mi_assert_internal(®ions[idx] == region); - // ensure the region is reserved - mi_region_info_t info = mi_atomic_read(®ion->info); - if (info == 0) + mi_region_info_t info = mi_atomic_read(®ions[idx].info); + if (mi_unlikely(info == 0)) { bool region_commit = mi_option_is_enabled(mi_option_eager_region_commit); - bool region_large = *allow_large; + bool region_large = allow_large; + bool is_zero = false; size_t arena_memid = 0; - void* start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, is_zero, &arena_memid, tld); - /* - void* start = NULL; - if (region_large) { - start = _mi_os_try_alloc_from_huge_reserved(MI_REGION_SIZE, MI_SEGMENT_ALIGN); - if (start != NULL) { region_commit = true; } - } - if (start == NULL) { - start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, region_commit, ®ion_large, tld); - } - */ - mi_assert_internal(!(region_large && !*allow_large)); + void* start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); + mi_assert_internal(!(region_large && !allow_large)); if (start == NULL) { - // failure to allocate from the OS! unclaim the blocks and fail - size_t map; - do { - map = mi_atomic_read_relaxed(®ion->map); - } while (!mi_atomic_cas_weak(®ion->map, map & ~mask, map)); + // failure to allocate from the OS! fail + *pinfo = 0; return false; } // set the newly allocated region - info = mi_region_info_create(start,region_large,region_commit); - if (mi_atomic_cas_strong(®ion->info, info, 0)) { + info = mi_region_info_create(start, region_large, region_commit); + if (mi_atomic_cas_strong(®ions[idx].info, info, 0)) { // update the region count - region->arena_memid = arena_memid; - mi_atomic_write(®ion->numa_node, _mi_os_numa_node(tld) + 1); + regions[idx].arena_memid = arena_memid; + mi_atomic_write(®ions[idx].numa_node, _mi_os_numa_node(tld) + 1); + mi_atomic_write(®ions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0)); mi_atomic_increment(®ions_count); } else { // failed, another thread allocated just before us! // we assign it to a later slot instead (up to 4 tries). - for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { + for (size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { regions[idx+i].arena_memid = arena_memid; mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node(tld) + 1); + mi_atomic_write(®ions_dirty[idx], is_zero ? 
0 : ~((uintptr_t)0)); mi_atomic_increment(®ions_count); start = NULL; break; @@ -232,27 +210,33 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats); } // and continue with the memory at our index - info = mi_atomic_read(®ion->info); + info = mi_atomic_read(®ions[idx].info); } } - mi_assert_internal(info == mi_atomic_read(®ion->info)); + mi_assert_internal(info == mi_atomic_read(®ions[idx].info)); mi_assert_internal(info != 0); + *pinfo = info; + return true; +} + + +/* ---------------------------------------------------------------------------- + Commit blocks +-----------------------------------------------------------------------------*/ + +static void* mi_region_commit_blocks(mi_bitmap_index_t bitmap_idx, mi_region_info_t info, size_t blocks, size_t size, bool* commit, bool* is_large, bool* is_zero, mi_os_tld_t* tld) +{ + // set dirty bits + *is_zero = mi_bitmap_claim(regions_dirty, MI_REGION_MAX, blocks, bitmap_idx); // Commit the blocks to memory bool region_is_committed = false; bool region_is_large = false; - void* start = mi_region_info_read(info,®ion_is_large,®ion_is_committed); - mi_assert_internal(!(region_is_large && !*allow_large)); + void* start = mi_region_info_read(info, ®ion_is_large, ®ion_is_committed); + mi_assert_internal(!(region_is_large && !*is_large)); mi_assert_internal(start!=NULL); - // set dirty bits - uintptr_t m; - do { - m = mi_atomic_read(®ion->dirty_mask); - } while (!mi_atomic_cas_weak(®ion->dirty_mask, m | mask, m)); - *is_zero = ((m & mask) == 0); // no dirty bit set in our claimed range? - - void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); + void* blocks_start = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bitmap_idx) * MI_SEGMENT_SIZE); if (*commit && !region_is_committed) { // ensure commit bool commit_zero = false; @@ -266,99 +250,58 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // and return the allocation mi_assert_internal(blocks_start != NULL); - *allow_large = region_is_large; - *p = blocks_start; - *id = mi_memid_create(idx, bitidx); + *is_large = region_is_large; + return blocks_start; +} + +/* ---------------------------------------------------------------------------- + Claim and allocate blocks in a region +-----------------------------------------------------------------------------*/ + +static bool mi_region_alloc_blocks( + size_t idx, size_t blocks, size_t size, + bool* commit, bool* allow_large, bool* is_zero, + void** p, size_t* id, mi_os_tld_t* tld) +{ + mi_bitmap_index_t bitmap_idx; + if (!mi_bitmap_try_claim_field(regions_map, idx, blocks, &bitmap_idx)) { + return true; // no error, but also no success + } + mi_region_info_t info; + if (!mi_region_ensure_allocated(idx,*allow_large,&info,tld)) { + // failed to allocate region memory, unclaim the bits and fail + mi_bitmap_unclaim(regions_map, MI_REGION_MAX, blocks, bitmap_idx); + return false; + } + *p = mi_region_commit_blocks(bitmap_idx,info,blocks,size,commit,allow_large,is_zero,tld); + *id = mi_memid_create(bitmap_idx); return true; } -// Use bit scan forward to quickly find the first zero bit if it is available -#if defined(_MSC_VER) -#define MI_HAVE_BITSCAN -#include -static inline size_t mi_bsf(uintptr_t x) { - if (x==0) return 8*MI_INTPTR_SIZE; - DWORD idx; - #if (MI_INTPTR_SIZE==8) - _BitScanForward64(&idx, x); - #else - _BitScanForward(&idx, x); - #endif - return idx; -} -static inline size_t mi_bsr(uintptr_t x) { - if 
(x==0) return 8*MI_INTPTR_SIZE; - DWORD idx; - #if (MI_INTPTR_SIZE==8) - _BitScanReverse64(&idx, x); - #else - _BitScanReverse(&idx, x); - #endif - return idx; -} -#elif defined(__GNUC__) || defined(__clang__) -#define MI_HAVE_BITSCAN -static inline size_t mi_bsf(uintptr_t x) { - return (x==0 ? 8*MI_INTPTR_SIZE : __builtin_ctzl(x)); -} -static inline size_t mi_bsr(uintptr_t x) { - return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - __builtin_clzl(x)); -} -#endif -// Allocate `blocks` in a `region` at `idx` of a given `size`. -// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. -// (not being able to claim is not considered an error so check for `p != NULL` afterwards). -static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, - bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld) -{ - mi_assert_internal(p != NULL && id != NULL); - mi_assert_internal(blocks < MI_REGION_MAP_BITS); +/* ---------------------------------------------------------------------------- + Try to allocate blocks in suitable regions +-----------------------------------------------------------------------------*/ - const uintptr_t mask = mi_region_block_mask(blocks, 0); - const size_t bitidx_max = MI_REGION_MAP_BITS - blocks; - uintptr_t map = mi_atomic_read(®ion->map); - if (map==MI_REGION_MAP_FULL) return true; - - #ifdef MI_HAVE_BITSCAN - size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible - #else - size_t bitidx = 0; // otherwise start at 0 - #endif - uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx - - // scan linearly for a free range of zero bits - while(bitidx <= bitidx_max) { - if ((map & m) == 0) { // are the mask bits free at bitidx? - mi_assert_internal((m >> bitidx) == mask); // no overflow? - uintptr_t newmap = map | m; - mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_weak(®ion->map, newmap, map)) { // TODO: use strong cas here? - // no success, another thread claimed concurrently.. keep going - map = mi_atomic_read(®ion->map); - continue; - } - else { - // success, we claimed the bits - // now commit the block memory -- this can still fail - return mi_region_commit_blocks(region, idx, bitidx, blocks, - size, commit, allow_large, is_zero, p, id, tld); - } - } - else { - // on to the next bit range - #ifdef MI_HAVE_BITSCAN - size_t shift = (blocks == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); - mi_assert_internal(shift > 0 && shift <= blocks); - #else - size_t shift = 1; - #endif - bitidx += shift; - m <<= shift; - } +static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool allow_large ) { + uintptr_t m = mi_atomic_read_relaxed(®ions_map[idx]); + if (m == MI_BITMAP_FIELD_FULL) return false; + if (numa_node >= 0) { // use negative numa node to always succeed + int rnode = ((int)mi_atomic_read_relaxed(®ions->numa_node)) - 1; + if (rnode != numa_node) return false; + } + if (mi_unlikely(!(commit || allow_large))) { + // otherwise skip incompatible regions if possible. + // this is not guaranteed due to multiple threads allocating at the same time but + // that's ok. In secure mode, large is never allowed for any thread, so that works out; + // otherwise we might just not be able to reset/decommit individual pages sometimes. 
+ mi_region_info_t info = mi_atomic_read_relaxed(®ions->info); + bool is_large; + bool is_committed; + void* start = mi_region_info_read(info, &is_large, &is_committed); + bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation? + if (!ok) return false; } - // no error, but also no bits found return true; } @@ -366,33 +309,15 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written // if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. // (not being able to claim is not considered an error so check for `p != NULL` afterwards). -static bool mi_region_try_alloc_blocks(int numa_node, size_t idx, size_t blocks, size_t size, +static bool mi_region_try_alloc_blocks( + int numa_node, size_t idx, size_t blocks, size_t size, bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld) { // check if there are available blocks in the region.. mi_assert_internal(idx < MI_REGION_MAX); - mem_region_t* region = ®ions[idx]; - uintptr_t m = mi_atomic_read_relaxed(®ion->map); - int rnode = ((int)mi_atomic_read_relaxed(®ion->numa_node)) - 1; - if ((rnode < 0 || rnode == numa_node) && // fits current numa node - (m != MI_REGION_MAP_FULL)) // and some bits are zero - { - bool ok = (*commit || *allow_large); // committing or allow-large is always ok - if (!ok) { - // otherwise skip incompatible regions if possible. - // this is not guaranteed due to multiple threads allocating at the same time but - // that's ok. In secure mode, large is never allowed for any thread, so that works out; - // otherwise we might just not be able to reset/decommit individual pages sometimes. - mi_region_info_t info = mi_atomic_read_relaxed(®ion->info); - bool is_large; - bool is_committed; - void* start = mi_region_info_read(info,&is_large,&is_committed); - ok = (start == NULL || (*commit || !is_committed) || (*allow_large || !is_large)); // Todo: test with one bitmap operation? - } - if (ok) { - return mi_region_alloc_blocks(region, idx, blocks, size, commit, allow_large, is_zero, p, id, tld); - } + if (mi_region_is_suitable(numa_node, idx, *commit, *allow_large)) { + return mi_region_alloc_blocks(idx, blocks, size, commit, allow_large, is_zero, p, id, tld); } return true; // no error, but no success either } @@ -426,14 +351,14 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l size = _mi_align_up(size, _mi_os_page_size()); // calculate the number of needed blocks - size_t blocks = mi_region_block_count(size); + const size_t blocks = mi_region_block_count(size); mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE); // find a range of free blocks - int numa_node = _mi_os_numa_node(tld); + const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld)); void* p = NULL; - size_t count = mi_atomic_read(®ions_count); - size_t idx = tld->region_idx; // start at 0 to reuse low addresses? Or, use tld->region_idx to reduce contention? + const size_t count = mi_atomic_read(®ions_count); + size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? 
for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around if (!mi_region_try_alloc_blocks(numa_node, idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error @@ -456,7 +381,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l *id = mi_memid_create_from_arena(arena_memid); } else { - tld->region_idx = idx; // next start of search? currently not used as we use first-fit + tld->region_idx = idx; // next start of search } mi_assert_internal( p == NULL || (uintptr_t)p % alignment == 0); @@ -475,9 +400,8 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { if (p==NULL) return; if (size==0) return; size_t arena_memid = 0; - size_t idx = 0; - size_t bitidx = 0; - if (mi_memid_indices(id,&idx,&bitidx,&arena_memid)) { + mi_bitmap_index_t bitmap_idx; + if (mi_memid_indices(id,&bitmap_idx,&arena_memid)) { // was a direct arena allocation, pass through _mi_arena_free(p, size, arena_memid, stats); } @@ -487,11 +411,11 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { // we can align the size up to page size (as we allocate that way too) // this ensures we fully commit/decommit/reset size = _mi_align_up(size, _mi_os_page_size()); - size_t blocks = mi_region_block_count(size); - size_t mask = mi_region_block_mask(blocks, bitidx); + const size_t blocks = mi_region_block_count(size); + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`? mem_region_t* region = ®ions[idx]; - mi_assert_internal((mi_atomic_read_relaxed(®ion->map) & mask) == mask ); // claimed? mi_region_info_t info = mi_atomic_read(®ion->info); bool is_large; bool is_eager_committed; @@ -499,8 +423,8 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { mi_assert_internal(start != NULL); void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); mi_assert_internal(blocks_start == p); // not a pointer in our area? - mi_assert_internal(bitidx + blocks <= MI_REGION_MAP_BITS); - if (blocks_start != p || bitidx + blocks > MI_REGION_MAP_BITS) return; // or `abort`? + mi_assert_internal(bitidx + blocks <= MI_BITMAP_FIELD_BITS); + if (blocks_start != p || bitidx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`? // decommit (or reset) the blocks to reduce the working set. // TODO: implement delayed decommit/reset as these calls are too expensive @@ -526,12 +450,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { // this frees up virtual address space which might be useful on 32-bit systems? // and unclaim - uintptr_t map; - uintptr_t newmap; - do { - map = mi_atomic_read_relaxed(®ion->map); - newmap = map & ~mask; - } while (!mi_atomic_cas_weak(®ion->map, newmap, map)); + mi_bitmap_unclaim(regions_map, MI_REGION_MAX, blocks, bitmap_idx); } } @@ -542,23 +461,23 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { void _mi_mem_collect(mi_stats_t* stats) { // free every region that has no segments in use. 
for (size_t i = 0; i < regions_count; i++) { - mem_region_t* region = ®ions[i]; - if (mi_atomic_read_relaxed(®ion->map) == 0) { + if (mi_atomic_read_relaxed(®ions_map[i]) == 0) { // if no segments used, try to claim the whole region uintptr_t m; do { - m = mi_atomic_read_relaxed(®ion->map); - } while(m == 0 && !mi_atomic_cas_weak(®ion->map, ~((uintptr_t)0), 0 )); + m = mi_atomic_read_relaxed(®ions_map[i]); + } while(m == 0 && !mi_atomic_cas_weak(®ions_map[i], MI_BITMAP_FIELD_FULL, 0 )); if (m == 0) { // on success, free the whole region bool is_eager_committed; - void* start = mi_region_info_read(mi_atomic_read(®ion->info), NULL, &is_eager_committed); + void* start = mi_region_info_read(mi_atomic_read(®ions[i].info), NULL, &is_eager_committed); if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { - _mi_arena_free(start, MI_REGION_SIZE, region->arena_memid, stats); + _mi_arena_free(start, MI_REGION_SIZE, regions[i].arena_memid, stats); } // and release - mi_atomic_write(®ion->info,0); - mi_atomic_write(®ion->map,0); + mi_atomic_write(®ions[i].info,0); + mi_atomic_write(®ions_dirty[i],0); + mi_atomic_write(®ions_map[i],0); } } } From b09282bc0d6e3228c556eac833331438dbe774be Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 6 Nov 2019 22:49:01 -0800 Subject: [PATCH 24/48] change arena allocator to atomic bitmap as well --- include/mimalloc.h | 4 +- src/arena.c | 268 +++++++++++++-------------------------------- src/bitmap.inc.c | 6 +- src/init.c | 4 +- src/os.c | 20 ++-- 5 files changed, 94 insertions(+), 208 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index c03ddc1e..70b6e412 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -230,8 +230,8 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; mi_decl_export bool mi_is_redirected() mi_attr_noexcept; -mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept; -mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t timeout_msecs) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; // deprecated mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; diff --git a/src/arena.c b/src/arena.c index e58d2c47..b807cd47 100644 --- a/src/arena.c +++ b/src/arena.c @@ -7,15 +7,19 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- "Arenas" are fixed area's of OS memory from which we can allocate -large blocks (>= MI_ARENA_BLOCK_SIZE, 16MiB). Currently only used to +large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB). Currently only used to allocate in one arena consisting of huge OS pages -- otherwise it delegates to direct allocation from the OS. In the future, we can expose an API to manually add more arenas which is sometimes needed for embedded devices or shared memory for example. -The arena allocation needs to be thread safe and we use a lock-free scan -with on-demand coalescing. +The arena allocation needs to be thread safe and we use an atomic +bitmap to allocate. 
The current implementation of the bitmap can +only do this within a field (`uintptr_t`) so we can allocate at most +blocks of 2GiB (64*32MiB) and no object can cross the boundary. This +can lead to fragmentation but fortunately most objects will be regions +of 256MiB in practice. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc-internal.h" @@ -23,6 +27,8 @@ with on-demand coalescing. #include // memset +#include "bitmap.inc.c" // atomic bitmap + // os.c void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); void _mi_os_free(void* p, size_t size, mi_stats_t* stats); @@ -36,9 +42,11 @@ int _mi_os_numa_node_count(void); Arena allocation ----------------------------------------------------------- */ -#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE -#define MI_ARENA_BLOCK_SIZE (4*MI_SEGMENT_ALIGN) // 16MiB -#define MI_MAX_ARENAS (64) +#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE +#define MI_ARENA_BLOCK_SIZE (8*MI_SEGMENT_ALIGN) // 32MiB +#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_FIELD_BITS * MI_ARENA_BLOCK_SIZE) // 2GiB +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 16MiB +#define MI_MAX_ARENAS (64) // not more than 256 (since we use 8 bits in the memid) // Block info: bit 0 contains the `in_use` bit, the upper bits the // size in count of arena blocks. @@ -48,11 +56,13 @@ typedef uintptr_t mi_block_info_t; typedef struct mi_arena_s { uint8_t* start; // the start of the memory area size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t field_count; // number of bitmap fields int numa_node; // associated NUMA node bool is_zero_init; // is the arena zero initialized? bool is_large; // large OS page allocated - _Atomic(uintptr_t) block_bottom; // optimization to start the search for free blocks - _Atomic(mi_block_info_t) blocks[1]; // `block_count` block info's + volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks } mi_arena_t; @@ -69,180 +79,55 @@ static _Atomic(uintptr_t) mi_arena_count; // = 0 // Use `0` as a special id for direct OS allocated memory. 
#define MI_MEMID_OS 0 -static size_t mi_memid_create(size_t arena_index, size_t block_index) { +static size_t mi_memid_create(size_t arena_index, mi_bitmap_index_t bitmap_index) { mi_assert_internal(arena_index < 0xFE); - return ((block_index << 8) | ((arena_index+1) & 0xFF)); + return ((bitmap_index << 8) | ((arena_index+1) & 0xFF)); } -static void mi_memid_indices(size_t memid, size_t* arena_index, size_t* block_index) { +static void mi_memid_indices(size_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { mi_assert_internal(memid != MI_MEMID_OS); *arena_index = (memid & 0xFF) - 1; - *block_index = (memid >> 8); + *bitmap_index = (memid >> 8); } -/* ----------------------------------------------------------- - Block info ------------------------------------------------------------ */ -static bool mi_block_is_in_use(mi_block_info_t info) { - return ((info&1) != 0); +static size_t mi_arena_block_count_of_size(size_t size) { + const size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); + const size_t bcount = asize / MI_ARENA_BLOCK_SIZE; + return bcount; } -static size_t mi_block_count(mi_block_info_t info) { - return (info>>1); -} - -static mi_block_info_t mi_block_info_create(size_t bcount, bool in_use) { - return (((mi_block_info_t)bcount << 1) | (in_use ? 1 : 0)); -} - - /* ----------------------------------------------------------- Thread safe allocation in an arena ----------------------------------------------------------- */ - -static void* mi_arena_allocx(mi_arena_t* arena, size_t start_idx, size_t end_idx, size_t needed_bcount, bool* is_zero, size_t* block_index) +static void* mi_arena_alloc(mi_arena_t* arena, size_t blocks, bool* is_zero, mi_bitmap_index_t* bitmap_idx) { - // Scan linearly through all block info's - // Skipping used ranges, coalescing free ranges on demand. - mi_assert_internal(needed_bcount > 0); - mi_assert_internal(start_idx <= arena->block_count); - mi_assert_internal(end_idx <= arena->block_count); - _Atomic(mi_block_info_t)* block = &arena->blocks[start_idx]; - _Atomic(mi_block_info_t)* end = &arena->blocks[end_idx]; - while (block < end) { - mi_block_info_t binfo = mi_atomic_read_relaxed(block); - size_t bcount = mi_block_count(binfo); - if (mi_block_is_in_use(binfo)) { - // in-use, skip ahead - mi_assert_internal(bcount > 0); - block += bcount; - } - else { - // free blocks - if (bcount==0) { - // optimization: - // use 0 initialized blocks at the end, to use single atomic operation - // initially to reduce contention (as we don't need to split) - if (block + needed_bcount > end) { - return NULL; // does not fit - } - else if (!mi_atomic_cas_weak(block, mi_block_info_create(needed_bcount, true), binfo)) { - // ouch, someone else was quicker. Try again.. 
- continue; - } - else { - // we got it: return a pointer to the claimed memory - ptrdiff_t idx = (block - arena->blocks); - *is_zero = arena->is_zero_init; - *block_index = idx; - return (arena->start + (idx*MI_ARENA_BLOCK_SIZE)); - } - } - - mi_assert_internal(bcount>0); - if (needed_bcount > bcount) { -#if 0 // MI_NO_ARENA_COALESCE - block += bcount; // too small, skip to the next range - continue; -#else - // too small, try to coalesce - _Atomic(mi_block_info_t)* block_next = block + bcount; - if (block_next >= end) { - return NULL; // does not fit - } - mi_block_info_t binfo_next = mi_atomic_read(block_next); - size_t bcount_next = mi_block_count(binfo_next); - if (mi_block_is_in_use(binfo_next)) { - // next block is in use, cannot coalesce - block += (bcount + bcount_next); // skip ahea over both blocks - } - else { - // next block is free, try to coalesce - // first set the next one to being used to prevent dangling ranges - if (!mi_atomic_cas_strong(block_next, mi_block_info_create(bcount_next, true), binfo_next)) { - // someone else got in before us.. try again - continue; - } - else { - if (!mi_atomic_cas_strong(block, mi_block_info_create(bcount + bcount_next, true), binfo)) { // use strong to increase success chance - // someone claimed/coalesced the block in the meantime - // first free the next block again.. - bool ok = mi_atomic_cas_strong(block_next, mi_block_info_create(bcount_next, false), binfo_next); // must be strong - mi_assert(ok); UNUSED(ok); - // and try again - continue; - } - else { - // coalesced! try again - // todo: we could optimize here to immediately claim the block if the - // coalesced size is a fit instead of retrying. Keep it simple for now. - continue; - } - } - } -#endif - } - else { // needed_bcount <= bcount - mi_assert_internal(needed_bcount <= bcount); - // it fits, claim the whole block - if (!mi_atomic_cas_weak(block, mi_block_info_create(bcount, true), binfo)) { - // ouch, someone else was quicker. Try again.. - continue; - } - else { - // got it, now split off the needed part - if (needed_bcount < bcount) { - mi_atomic_write(block + needed_bcount, mi_block_info_create(bcount - needed_bcount, false)); - mi_atomic_write(block, mi_block_info_create(needed_bcount, true)); - } - // return a pointer to the claimed memory - ptrdiff_t idx = (block - arena->blocks); - *is_zero = false; - *block_index = idx; - return (arena->start + (idx*MI_ARENA_BLOCK_SIZE)); - } - } + const size_t fcount = arena->field_count; + size_t idx = mi_atomic_read(&arena->search_idx); // start from last search + for (size_t visited = 0; visited < fcount; visited++, idx++) { + if (idx >= fcount) idx = 0; // wrap around + if (mi_bitmap_try_claim_field(arena->blocks_map, idx, blocks, bitmap_idx)) { + // claimed it! set the dirty bits + *is_zero = mi_bitmap_claim(arena->blocks_dirty, fcount, blocks, *bitmap_idx); + mi_atomic_write(&arena->search_idx, idx); // start search from here next time + return (arena->start + (*bitmap_idx)*MI_ARENA_BLOCK_SIZE); } } - // no success return NULL; } -// Try to reduce search time by starting from bottom and wrap around. 
-static void* mi_arena_alloc(mi_arena_t* arena, size_t needed_bcount, bool* is_zero, size_t* block_index) -{ - uintptr_t bottom = mi_atomic_read_relaxed(&arena->block_bottom); - void* p = mi_arena_allocx(arena, bottom, arena->block_count, needed_bcount, is_zero, block_index); - if (p == NULL && bottom > 0) { - // try again from the start - p = mi_arena_allocx(arena, 0, bottom, needed_bcount, is_zero, block_index); - } - if (p != NULL) { - mi_atomic_write(&arena->block_bottom, *block_index); - } - return p; -} /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool* commit, bool* large, bool* is_zero, - size_t* memid) + bool* commit, bool* large, bool* is_zero, size_t* memid) { - size_t block_index = SIZE_MAX; - void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &block_index); + mi_bitmap_index_t bitmap_index; + void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &bitmap_index); if (p != NULL) { - mi_assert_internal(block_index != SIZE_MAX); - #if MI_DEBUG>=1 - _Atomic(mi_block_info_t)* block = &arena->blocks[block_index]; - mi_block_info_t binfo = mi_atomic_read(block); - mi_assert_internal(mi_block_is_in_use(binfo)); - mi_assert_internal(mi_block_count(binfo) >= needed_bcount); - #endif - *memid = mi_memid_create(arena_index, block_index); + *memid = mi_memid_create(arena_index, bitmap_index); *commit = true; // TODO: support commit on demand? *large = arena->is_large; } @@ -261,15 +146,13 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, if (large==NULL) large = &default_large; // ensure `large != NULL` // try to allocate in an arena if the alignment is small enough - // and if there is not too much waste around the `MI_ARENA_BLOCK_SIZE`. - if (alignment <= MI_SEGMENT_ALIGN && - size >= 3*(MI_ARENA_BLOCK_SIZE/4) && // > 12MiB (not more than 25% waste) - !(size > MI_ARENA_BLOCK_SIZE && size < 3*(MI_ARENA_BLOCK_SIZE/2)) // ! <16MiB - 24MiB> - ) + // and the object is not too large or too small. 
+ if (alignment <= MI_SEGMENT_ALIGN && + size <= MI_ARENA_MAX_OBJ_SIZE && + size >= MI_ARENA_MIN_OBJ_SIZE) { - size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); - size_t bcount = asize / MI_ARENA_BLOCK_SIZE; - int numa_node = _mi_os_numa_node(tld); // current numa node + const size_t bcount = mi_arena_block_count_of_size(size); + const int numa_node = _mi_os_numa_node(tld); // current numa node mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); // try numa affine allocation @@ -324,8 +207,8 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) { else { // allocated in an arena size_t arena_idx; - size_t block_idx; - mi_memid_indices(memid, &arena_idx, &block_idx); + size_t bitmap_idx; + mi_memid_indices(memid, &arena_idx, &bitmap_idx); mi_assert_internal(arena_idx < MI_MAX_ARENAS); mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[arena_idx])); mi_assert_internal(arena != NULL); @@ -333,27 +216,17 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) { _mi_fatal_error("trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } - mi_assert_internal(arena->block_count > block_idx); - if (arena->block_count <= block_idx) { - _mi_fatal_error("trying to free from non-existent block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); + if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { + _mi_fatal_error("trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } - _Atomic(mi_block_info_t)* block = &arena->blocks[block_idx]; - mi_block_info_t binfo = mi_atomic_read_relaxed(block); - mi_assert_internal(mi_block_is_in_use(binfo)); - mi_assert_internal(mi_block_count(binfo)*MI_ARENA_BLOCK_SIZE >= size); - if (!mi_block_is_in_use(binfo)) { + const size_t blocks = mi_arena_block_count_of_size(size); + bool ones = mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx); + if (!ones) { _mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size); return; }; - bool ok = mi_atomic_cas_strong(block, mi_block_info_create(mi_block_count(binfo), false), binfo); - mi_assert_internal(ok); - if (!ok) { - _mi_warning_message("unable to free arena block: %p, info 0x%zx", p, binfo); - } - if (block_idx < mi_atomic_read_relaxed(&arena->block_bottom)) { - mi_atomic_write(&arena->block_bottom, block_idx); - } } } @@ -365,8 +238,7 @@ static bool mi_arena_add(mi_arena_t* arena) { mi_assert_internal(arena != NULL); mi_assert_internal((uintptr_t)arena->start % MI_SEGMENT_ALIGN == 0); mi_assert_internal(arena->block_count > 0); - mi_assert_internal(mi_mem_is_zero(arena->blocks,arena->block_count*sizeof(mi_block_info_t))); - + uintptr_t i = mi_atomic_addu(&mi_arena_count,1); if (i >= MI_MAX_ARENAS) { mi_atomic_subu(&mi_arena_count, 1); @@ -383,40 +255,49 @@ static bool mi_arena_add(mi_arena_t* arena) { #include // ENOMEM // reserve at a specific numa node -int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept { +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { if (pages==0) return 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); size_t hsize = 0; size_t pages_reserved = 0; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, pages*500, &pages_reserved, &hsize); + void* p = 
_mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize); if (p==NULL || pages_reserved==0) { _mi_warning_message("failed to reserve %zu gb huge pages\n", pages); return ENOMEM; } _mi_verbose_message("reserved %zu gb huge pages\n", pages_reserved); - size_t bcount = hsize / MI_ARENA_BLOCK_SIZE; - size_t asize = sizeof(mi_arena_t) + (bcount*sizeof(mi_block_info_t)); // one too much + size_t bcount = mi_arena_block_count_of_size(hsize); + size_t fields = (bcount + MI_BITMAP_FIELD_BITS - 1) / MI_BITMAP_FIELD_BITS; + size_t asize = sizeof(mi_arena_t) + (2*fields*sizeof(mi_bitmap_field_t)); mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS? if (arena == NULL) { _mi_os_free_huge_pages(p, hsize, &_mi_stats_main); return ENOMEM; } arena->block_count = bcount; - arena->start = (uint8_t*)p; - arena->block_bottom = 0; + arena->field_count = fields; + arena->start = (uint8_t*)p; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = true; arena->is_zero_init = true; - memset(arena->blocks, 0, bcount * sizeof(mi_block_info_t)); + arena->search_idx = 0; + arena->blocks_dirty = &arena->blocks_map[bcount]; + size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; + if (post > 0) { + // don't use leftover bits at the end + mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); + mi_bitmap_claim(arena->blocks_map, fields, post, postidx); + } + mi_arena_add(arena); return 0; } // reserve huge pages evenly among all numa nodes. -int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept { +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t timeout_msecs) mi_attr_noexcept { if (pages == 0) return 0; // pages per numa node @@ -424,12 +305,13 @@ int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept { if (numa_count <= 0) numa_count = 1; const size_t pages_per = pages / numa_count; const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs / numa_count) + 50; // reserve evenly among numa nodes for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { size_t node_pages = pages_per; // can be 0 if ((size_t)numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, numa_node); + int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per); if (err) return err; if (pages < node_pages) { pages = 0; @@ -446,7 +328,7 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv UNUSED(max_secs); _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); if (pages_reserved != NULL) *pages_reserved = 0; - int err = mi_reserve_huge_os_pages_interleave(pages); + int err = mi_reserve_huge_os_pages_interleave(pages, (size_t)(max_secs * 1000.0)); if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; return err; } diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index 5bea4748..aeb185d1 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -135,13 +135,15 @@ static inline bool mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, } // Set `count` bits at `bitmap_idx` to 0 atomically -static inline void mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { +// Returns `true` if all `count` bits were 1 previously +static inline bool 
mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); mi_assert_internal((bitmap[idx] & mask) == mask); - mi_atomic_and(&bitmap[idx], ~mask); + uintptr_t prev = mi_atomic_and(&bitmap[idx], ~mask); + return ((prev & mask) == mask); } diff --git a/src/init.c b/src/init.c index ef848de4..f6d253f9 100644 --- a/src/init.c +++ b/src/init.c @@ -433,8 +433,8 @@ static void mi_process_load(void) { } if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { - size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); - mi_reserve_huge_os_pages_interleave(pages); + size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); + mi_reserve_huge_os_pages_interleave(pages, pages*500); } } diff --git a/src/os.c b/src/os.c index 254f85f1..027df6ab 100644 --- a/src/os.c +++ b/src/os.c @@ -940,16 +940,18 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout - mi_msecs_t elapsed = _mi_clock_end(start_t); - if (page >= 1) { - mi_msecs_t estimate = ((elapsed / (page+1)) * pages); - if (estimate > 2*max_msecs) { // seems like we are going to timeout, break - elapsed = max_msecs + 1; + if (max_msecs > 0) { + mi_msecs_t elapsed = _mi_clock_end(start_t); + if (page >= 1) { + mi_msecs_t estimate = ((elapsed / (page+1)) * pages); + if (estimate > 2*max_msecs) { // seems like we are going to timeout, break + elapsed = max_msecs + 1; + } + } + if (elapsed > max_msecs) { + _mi_warning_message("huge page allocation timed out\n"); + break; } - } - if (elapsed > max_msecs) { - _mi_warning_message("huge page allocation timed out\n"); - break; } } mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size); From 378716c46724d839411166a0bba68b0722cf9d8b Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Nov 2019 10:26:52 -0800 Subject: [PATCH 25/48] refactor and improve atomic bitmap usage --- CMakeLists.txt | 12 ++- ide/vs2019/mimalloc-override.vcxproj | 3 + ide/vs2019/mimalloc.vcxproj | 4 +- include/mimalloc-internal.h | 11 ++- include/mimalloc-types.h | 10 +-- src/arena.c | 62 +++++++-------- src/bitmap.inc.c | 110 ++++++++++++++++++--------- src/memory.c | 96 +++++++++++------------ src/page.c | 2 + test/test-stress.c | 4 +- 10 files changed, 183 insertions(+), 131 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12540f68..0726c601 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,7 @@ option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF) option(MI_SECURE "Use security mitigations (like guard pages and randomization)" OFF) +option(MI_SECURE_FULL "Use full security mitigations (like double free protection, more expensive)" OFF) option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) option(MI_BUILD_TESTS "Build test executables" ON) @@ -70,9 +71,14 @@ if(MI_OVERRIDE MATCHES "ON") endif() endif() -if(MI_SECURE MATCHES "ON") - message(STATUS "Set secure build (MI_SECURE=ON)") - list(APPEND mi_defines MI_SECURE=3) +if(MI_SECURE_FULL MATCHES "ON") + message(STATUS "Set full secure build (experimental) 
(MI_SECURE_FULL=ON)") + list(APPEND mi_defines MI_SECURE=4) +else() + if(MI_SECURE MATCHES "ON") + message(STATUS "Set secure build (MI_SECURE=ON)") + list(APPEND mi_defines MI_SECURE=3) + endif() endif() if(MI_SEE_ASM MATCHES "ON") diff --git a/ide/vs2019/mimalloc-override.vcxproj b/ide/vs2019/mimalloc-override.vcxproj index e1c7535c..49f3d213 100644 --- a/ide/vs2019/mimalloc-override.vcxproj +++ b/ide/vs2019/mimalloc-override.vcxproj @@ -232,6 +232,9 @@ + + true + diff --git a/ide/vs2019/mimalloc.vcxproj b/ide/vs2019/mimalloc.vcxproj index 19696c10..bae49bab 100644 --- a/ide/vs2019/mimalloc.vcxproj +++ b/ide/vs2019/mimalloc.vcxproj @@ -218,7 +218,9 @@ - + + true + diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 413f76e6..4d8b6a77 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -163,7 +163,6 @@ bool _mi_page_is_valid(mi_page_t* page); // Overflow detecting multiply -#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { #if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5 #include // UINT_MAX, ULONG_MAX @@ -175,6 +174,7 @@ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { return __builtin_umulll_overflow(count, size, total); #endif #else /* __builtin_umul_overflow is unavailable */ + #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) *total = count * size; return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count); @@ -188,6 +188,7 @@ static inline bool _mi_is_power_of_two(uintptr_t x) { // Align upwards static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { + mi_assert_internal(alignment != 0); uintptr_t mask = alignment - 1; if ((alignment & mask) == 0) { // power of two? return ((sz + mask) & ~mask); @@ -197,6 +198,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { } } +// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. +static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { + mi_assert_internal(divider != 0); + return (divider == 0 ? size : ((size + divider - 1) / divider)); +} + // Is memory zero initialized? static inline bool mi_mem_is_zero(void* p, size_t size) { for (size_t i = 0; i < size; i++) { @@ -283,7 +290,7 @@ static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment; - mi_assert_internal(diff >= 0 && diff < MI_SEGMENT_SIZE); + mi_assert_internal(diff >= 0 && (size_t)diff < MI_SEGMENT_SIZE); uintptr_t idx = (uintptr_t)diff >> segment->page_shift; mi_assert_internal(idx < segment->capacity); mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0); diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 99b6b22b..ced8e7a9 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -29,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file // #define MI_SECURE 4 // experimental, may be more expensive: checks for double free. #if !defined(MI_SECURE) -#define MI_SECURE 0 +#define MI_SECURE 4 #endif // Define MI_DEBUG for debug mode @@ -93,12 +93,12 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4mb // Derived constants -#define MI_SEGMENT_SIZE (1<= MI_ARENA_BLOCK_SIZE, 32MiB). Currently only used to -allocate in one arena consisting of huge OS pages -- otherwise it -delegates to direct allocation from the OS. +large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB). +In contrast to the rest of mimalloc, the arenas are shared between +threads and need to be accessed using atomic operations. -In the future, we can expose an API to manually add more arenas which -is sometimes needed for embedded devices or shared memory for example. +Currently arenas are only used to for huge OS page (1GiB) reservations, +otherwise it delegates to direct allocation from the OS. +In the future, we can expose an API to manually add more kinds of arenas +which is sometimes needed for embedded devices or shared memory for example. +(We can also employ this with WASI or `sbrk` systems to reserve large arenas + on demand and be able to reuse them efficiently). The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. The current implementation of the bitmap can @@ -48,10 +52,6 @@ int _mi_os_numa_node_count(void); #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 16MiB #define MI_MAX_ARENAS (64) // not more than 256 (since we use 8 bits in the memid) -// Block info: bit 0 contains the `in_use` bit, the upper bits the -// size in count of arena blocks. -typedef uintptr_t mi_block_info_t; - // A memory arena descriptor typedef struct mi_arena_s { uint8_t* start; // the start of the memory area @@ -61,8 +61,8 @@ typedef struct mi_arena_s { bool is_zero_init; // is the arena zero initialized? bool is_large; // large OS page allocated volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks } mi_arena_t; @@ -81,6 +81,7 @@ static _Atomic(uintptr_t) mi_arena_count; // = 0 static size_t mi_memid_create(size_t arena_index, mi_bitmap_index_t bitmap_index) { mi_assert_internal(arena_index < 0xFE); + mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow? 
return ((bitmap_index << 8) | ((arena_index+1) & 0xFF)); } @@ -90,30 +91,25 @@ static void mi_memid_indices(size_t memid, size_t* arena_index, mi_bitmap_index_ *bitmap_index = (memid >> 8); } - -static size_t mi_arena_block_count_of_size(size_t size) { - const size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); - const size_t bcount = asize / MI_ARENA_BLOCK_SIZE; - return bcount; +static size_t mi_block_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); } /* ----------------------------------------------------------- Thread safe allocation in an arena ----------------------------------------------------------- */ -static void* mi_arena_alloc(mi_arena_t* arena, size_t blocks, bool* is_zero, mi_bitmap_index_t* bitmap_idx) +static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx) { const size_t fcount = arena->field_count; size_t idx = mi_atomic_read(&arena->search_idx); // start from last search for (size_t visited = 0; visited < fcount; visited++, idx++) { if (idx >= fcount) idx = 0; // wrap around if (mi_bitmap_try_claim_field(arena->blocks_map, idx, blocks, bitmap_idx)) { - // claimed it! set the dirty bits - *is_zero = mi_bitmap_claim(arena->blocks_dirty, fcount, blocks, *bitmap_idx); mi_atomic_write(&arena->search_idx, idx); // start search from here next time - return (arena->start + (*bitmap_idx)*MI_ARENA_BLOCK_SIZE); + return true; } } - return NULL; + return false; } @@ -125,13 +121,15 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n bool* commit, bool* large, bool* is_zero, size_t* memid) { mi_bitmap_index_t bitmap_index; - void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &bitmap_index); - if (p != NULL) { - *memid = mi_memid_create(arena_index, bitmap_index); - *commit = true; // TODO: support commit on demand? - *large = arena->is_large; + if (mi_arena_alloc(arena, needed_bcount, &bitmap_index)) { + // claimed it! set the dirty bits (todo: no need for an atomic op here?) + *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index); + *memid = mi_memid_create(arena_index, bitmap_index); + *commit = true; // TODO: support commit on demand? 
+ *large = arena->is_large; + return (arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE)); } - return p; + return NULL; } void* _mi_arena_alloc_aligned(size_t size, size_t alignment, @@ -140,7 +138,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, { mi_assert_internal(memid != NULL && tld != NULL); mi_assert_internal(size > 0); - *memid = MI_MEMID_OS; + *memid = MI_MEMID_OS; *is_zero = false; bool default_large = false; if (large==NULL) large = &default_large; // ensure `large != NULL` @@ -151,7 +149,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size <= MI_ARENA_MAX_OBJ_SIZE && size >= MI_ARENA_MIN_OBJ_SIZE) { - const size_t bcount = mi_arena_block_count_of_size(size); + const size_t bcount = mi_block_count_of_size(size); const int numa_node = _mi_os_numa_node(tld); // current numa node mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); @@ -221,7 +219,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) { _mi_fatal_error("trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } - const size_t blocks = mi_arena_block_count_of_size(size); + const size_t blocks = mi_block_count_of_size(size); bool ones = mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx); if (!ones) { _mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size); @@ -268,7 +266,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec } _mi_verbose_message("reserved %zu gb huge pages\n", pages_reserved); - size_t bcount = mi_arena_block_count_of_size(hsize); + size_t bcount = mi_block_count_of_size(hsize); size_t fields = (bcount + MI_BITMAP_FIELD_BITS - 1) / MI_BITMAP_FIELD_BITS; size_t asize = sizeof(mi_arena_t) + (2*fields*sizeof(mi_bitmap_field_t)); mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS? @@ -284,6 +282,8 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec arena->is_zero_init = true; arena->search_idx = 0; arena->blocks_dirty = &arena->blocks_map[bcount]; + // the bitmaps are already zero initialized due to os_alloc + // just claim leftover blocks if needed size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; if (post > 0) { // don't use leftover bits at the end diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index aeb185d1..19e6bbb8 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -1,41 +1,30 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +This file is meant to be included in other files for efficiency. +It implements a bitmap that can set/reset sequences of bits atomically +and is used to concurrently claim memory ranges. + +A bitmap is an array of fields where each field is a machine word (`uintptr_t`) + +A current limitation is that the bit sequences cannot cross fields +and that the sequence must be smaller or equal to the bits in a field. 
+---------------------------------------------------------------------------- */ #pragma once -#ifndef MI_BITMAP_H -#define MI_BITMAP_H +#ifndef MI_BITMAP_C +#define MI_BITMAP_C #include "mimalloc.h" #include "mimalloc-internal.h" -// Use bit scan forward to quickly find the first zero bit if it is available -#if defined(_MSC_VER) -#define MI_HAVE_BITSCAN -#include -static inline size_t mi_bsf(uintptr_t x) { - if (x==0) return 8*MI_INTPTR_SIZE; - DWORD idx; - MI_64(_BitScanForward)(&idx, x); - return idx; -} -static inline size_t mi_bsr(uintptr_t x) { - if (x==0) return 8*MI_INTPTR_SIZE; - DWORD idx; - MI_64(_BitScanReverse)(&idx, x); - return idx; -} -#elif defined(__GNUC__) || defined(__clang__) -#define MI_HAVE_BITSCAN -#if (INTPTR_MAX == LONG_MAX) -# define MI_L(x) x##l -#else -# define MI_L(x) x##ll -#endif -static inline size_t mi_bsf(uintptr_t x) { - return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x)); -} -static inline size_t mi_bsr(uintptr_t x) { - return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x)); -} -#endif - +/* ----------------------------------------------------------- + Bitmap definition +----------------------------------------------------------- */ #define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE) #define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set @@ -63,14 +52,59 @@ static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) return (bitmap_idx % MI_BITMAP_FIELD_BITS); } +// Get the full bit index +static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { + return bitmap_idx; +} + + // The bit mask for a given number of blocks at a specified bit index. static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); return ((((uintptr_t)1 << count) - 1) << bitidx); } -// Try to atomically claim a sequence of `count` bits in a single field at `idx` in `bitmap`. -// Returns `true` on success. + +/* ----------------------------------------------------------- + Use bit scan forward/reverse to quickly find the first zero bit if it is available +----------------------------------------------------------- */ +#if defined(_MSC_VER) +#define MI_HAVE_BITSCAN +#include +static inline size_t mi_bsf(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + MI_64(_BitScanForward)(&idx, x); + return idx; +} +static inline size_t mi_bsr(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + MI_64(_BitScanReverse)(&idx, x); + return idx; +} +#elif defined(__GNUC__) || defined(__clang__) +#include // LONG_MAX +#define MI_HAVE_BITSCAN +#if (INTPTR_MAX == LONG_MAX) +# define MI_L(x) x##l +#else +# define MI_L(x) x##ll +#endif +static inline size_t mi_bsf(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x)); +} +static inline size_t mi_bsr(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x)); +} +#endif + +/* ----------------------------------------------------------- + Claim a bit sequence atomically +----------------------------------------------------------- */ + +// Try to atomically claim a sequence of `count` bits in a single +// field at `idx` in `bitmap`. Returns `true` on success. 
static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { mi_assert_internal(bitmap_idx != NULL); @@ -93,7 +127,7 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con while (bitidx <= bitidx_max) { if ((map & m) == 0) { // are the mask bits free at bitidx? mi_assert_internal((m >> bitidx) == mask); // no overflow? - uintptr_t newmap = map | m; + const uintptr_t newmap = map | m; mi_assert_internal((newmap^map) >> bitidx == mask); if (!mi_atomic_cas_weak(field, newmap, map)) { // TODO: use strong cas here? // no success, another thread claimed concurrently.. keep going @@ -109,10 +143,10 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con else { // on to the next bit range #ifdef MI_HAVE_BITSCAN - size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); + const size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); mi_assert_internal(shift > 0 && shift <= count); #else - size_t shift = 1; + const size_t shift = 1; #endif bitidx += shift; m <<= shift; diff --git a/src/memory.c b/src/memory.c index 29e0e412..bdbf1e48 100644 --- a/src/memory.c +++ b/src/memory.c @@ -16,10 +16,10 @@ We need this memory layer between the raw OS calls because of: 1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order to reuse memory effectively. 2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of - an OS allocation/free is still (much) too expensive relative to the accesses in that - object :-( (`malloc-large` tests this). This means we need a cheaper way to - reuse memory. -3. This layer can help with a NUMA aware allocation in the future. + an OS allocation/free is still (much) too expensive relative to the accesses + in that object :-( (`malloc-large` tests this). This means we need a cheaper + way to reuse memory. +3. This layer allows for NUMA aware allocation. 
Possible issues: - (2) can potentially be addressed too with a small cache per thread which is much @@ -47,8 +47,6 @@ bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats); bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats); bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -//void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); -//void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats); // arena.c void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats); @@ -58,18 +56,18 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo // Constants #if (MI_INTPTR_SIZE==8) -#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 16KiB for the region map +#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 40KiB for the region map #elif (MI_INTPTR_SIZE==4) -#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // 196 bytes for the region map +#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // ~ KiB for the region map #else #error "define the maximum heap space allowed for regions on this platform" #endif #define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE -#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB +#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB (64MiB on 32 bits) #define MI_REGION_MAX_ALLOC_SIZE (MI_REGION_SIZE/4) // 64MiB -#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) +#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) // 1024 (48 on 32 bits) // Region info is a pointer to the memory region and two bits for @@ -95,7 +93,7 @@ typedef struct mem_region_s { size_t arena_memid; // if allocated from a (huge page) arena } mem_region_t; -// The region map; 16KiB for a 256GiB HEAP_REGION_MAX +// The region map static mem_region_t regions[MI_REGION_MAX]; // A bit mask per region for its claimed MI_SEGMENT_SIZE blocks. @@ -173,7 +171,7 @@ static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_i bool region_large = allow_large; bool is_zero = false; size_t arena_memid = 0; - void* start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); + void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); mi_assert_internal(!(region_large && !allow_large)); if (start == NULL) { @@ -183,35 +181,31 @@ static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_i } // set the newly allocated region + // try to initialize any region up to 4 beyond the current one in + // care multiple threads are doing this concurrently (common at startup) info = mi_region_info_create(start, region_large, region_commit); - if (mi_atomic_cas_strong(®ions[idx].info, info, 0)) { - // update the region count - regions[idx].arena_memid = arena_memid; - mi_atomic_write(®ions[idx].numa_node, _mi_os_numa_node(tld) + 1); - mi_atomic_write(®ions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0)); - mi_atomic_increment(®ions_count); - } - else { - // failed, another thread allocated just before us! - // we assign it to a later slot instead (up to 4 tries). 
- for (size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { - if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { - regions[idx+i].arena_memid = arena_memid; - mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node(tld) + 1); - mi_atomic_write(®ions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0)); - mi_atomic_increment(®ions_count); - start = NULL; - break; - } + bool claimed = false; + for (size_t i = 0; i <= 4 && idx + i < MI_REGION_MAX && !claimed; i++) { + if (!is_zero) { + // set dirty bits before CAS; this might race with a zero block but that is ok. + // (but writing before cas prevents a concurrent allocation to assume it is not dirty) + mi_atomic_write(®ions_dirty[idx+i], MI_BITMAP_FIELD_FULL); } - if (start != NULL) { - // free it if we didn't succeed to save it to some other region - _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); - // _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats); + if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { + // claimed! + regions[idx+i].arena_memid = arena_memid; + mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node(tld) + 1); + mi_atomic_increment(®ions_count); + claimed = true; } - // and continue with the memory at our index - info = mi_atomic_read(®ions[idx].info); } + if (!claimed) { + // free our OS allocation if we didn't succeed to store it in some region + _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); + } + // continue with the actual info at our index in case another thread was quicker with the allocation + info = mi_atomic_read(®ions[idx].info); + mi_assert_internal(info != 0); } mi_assert_internal(info == mi_atomic_read(®ions[idx].info)); mi_assert_internal(info != 0); @@ -290,19 +284,21 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a int rnode = ((int)mi_atomic_read_relaxed(®ions->numa_node)) - 1; if (rnode != numa_node) return false; } - if (mi_unlikely(!(commit || allow_large))) { - // otherwise skip incompatible regions if possible. - // this is not guaranteed due to multiple threads allocating at the same time but - // that's ok. In secure mode, large is never allowed for any thread, so that works out; - // otherwise we might just not be able to reset/decommit individual pages sometimes. - mi_region_info_t info = mi_atomic_read_relaxed(®ions->info); - bool is_large; - bool is_committed; - void* start = mi_region_info_read(info, &is_large, &is_committed); - bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation? - if (!ok) return false; - } - return true; + if (commit && allow_large) return true; // always ok + + // otherwise skip incompatible regions if possible. + // this is not guaranteed due to multiple threads allocating at the same time but + // that's ok. In secure mode, large is never allowed for any thread, so that works out; + // otherwise we might just not be able to reset/decommit individual pages sometimes. + mi_region_info_t info = mi_atomic_read_relaxed(®ions->info); + bool is_large; + bool is_committed; + void* start = mi_region_info_read(info, &is_large, &is_committed); + // note: we also skip if commit is false and the region is committed, + // that is a bit strong but prevents allocation of eager delayed segments in + // committed memory + bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation? 
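// Illustration (not part of the patch): the expression above is an OR of
// escape hatches, so a region is only skipped when none of them holds.
// Two hypothetical states spell this out:
//   commit=false, is_committed=true, allow_large=true, is_large=false
//     -> (commit || !is_committed) is false, but (allow_large || !is_large)
//        is true, so `ok` stays true and the committed region is still used.
//   commit=false, is_committed=true, allow_large=false, is_large=true
//     -> all terms are false, so the region is skipped.
// The region refactor later in this series replaces this with two separate
// early-outs (!commit && is_committed, and !allow_large && is_large), which
// also rejects the first state; a still later patch relaxes the committed
// check again.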
+ return ok; } // Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim. diff --git a/src/page.c b/src/page.c index 32b68edb..c5b6e370 100644 --- a/src/page.c +++ b/src/page.c @@ -497,8 +497,10 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats) { UNUSED(stats); + #if (MI_SECURE <= 2) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); + #endif mi_assert_internal(page->capacity + extend <= page->reserved); void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); size_t bsize = page->block_size; diff --git a/test/test-stress.c b/test/test-stress.c index bb428072..d80cb1a4 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -66,7 +66,9 @@ static void* alloc_items(size_t items, random_t r) { if (chance(1, r)) items *= 100; // 1% huge objects; if (items==40) items++; // pthreads uses that size for stack increases uintptr_t* p = (uintptr_t*)mi_malloc(items*sizeof(uintptr_t)); - for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie; + if (p != NULL) { + for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie; + } return p; } From 27f1a8b3d24acf0ff0bcbdacfbecd21437fb450e Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Nov 2019 10:35:30 -0800 Subject: [PATCH 26/48] fix avg display; set secure default to 0` --- include/mimalloc-types.h | 2 +- src/stats.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index ddbe72f3..3f5e4e27 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -29,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file // #define MI_SECURE 4 // experimental, may be more expensive: checks for double free. (cmake -DMI_SECURE_FULL=ON) #if !defined(MI_SECURE) -#define MI_SECURE 4 +#define MI_SECURE 0 #endif // Define MI_DEBUG for debug mode diff --git a/src/stats.c b/src/stats.c index 011fab64..cb6d8866 100644 --- a/src/stats.c +++ b/src/stats.c @@ -206,7 +206,7 @@ static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* const int64_t avg_tens = (stat->count == 0 ? 
0 : (stat->total*10 / stat->count)); const long avg_whole = (long)(avg_tens/10); const long avg_frac1 = (long)(avg_tens%10); - _mi_fprintf(out, "%10s: %5ld.%ld avg %ld %ld\n", msg, avg_whole, avg_frac1); + _mi_fprintf(out, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1); } From 13f5e6e43e9aae4043d9acc94fac67746fcd9bb4 Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Nov 2019 18:09:30 -0800 Subject: [PATCH 27/48] fix numa node check in regions --- src/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/memory.c b/src/memory.c index bdbf1e48..fb3f5093 100644 --- a/src/memory.c +++ b/src/memory.c @@ -282,7 +282,7 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a if (m == MI_BITMAP_FIELD_FULL) return false; if (numa_node >= 0) { // use negative numa node to always succeed int rnode = ((int)mi_atomic_read_relaxed(®ions->numa_node)) - 1; - if (rnode != numa_node) return false; + if (rnode >= 0 && rnode != numa_node) return false; } if (commit && allow_large) return true; // always ok From 7b72a4cd50782563104e28becb7e181e8978449f Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 8 Nov 2019 11:55:43 -0800 Subject: [PATCH 28/48] fix region suitable bug --- src/memory.c | 6 +++--- test/test-stress.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/memory.c b/src/memory.c index bdbf1e48..f8798d99 100644 --- a/src/memory.c +++ b/src/memory.c @@ -281,8 +281,8 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a uintptr_t m = mi_atomic_read_relaxed(®ions_map[idx]); if (m == MI_BITMAP_FIELD_FULL) return false; if (numa_node >= 0) { // use negative numa node to always succeed - int rnode = ((int)mi_atomic_read_relaxed(®ions->numa_node)) - 1; - if (rnode != numa_node) return false; + int rnode = ((int)mi_atomic_read_relaxed(®ions[idx].numa_node)) - 1; + if (rnode >= 0 && rnode != numa_node) return false; } if (commit && allow_large) return true; // always ok @@ -290,7 +290,7 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a // this is not guaranteed due to multiple threads allocating at the same time but // that's ok. In secure mode, large is never allowed for any thread, so that works out; // otherwise we might just not be able to reset/decommit individual pages sometimes. - mi_region_info_t info = mi_atomic_read_relaxed(®ions->info); + mi_region_info_t info = mi_atomic_read_relaxed(®ions[idx].info); bool is_large; bool is_committed; void* start = mi_region_info_read(info, &is_large, &is_committed); diff --git a/test/test-stress.c b/test/test-stress.c index d80cb1a4..be2a9c67 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -18,7 +18,7 @@ terms of the MIT license. 
// argument defaults static int THREADS = 32; // more repeatable if THREADS <= #processors -static int N = 20; // scaling factor +static int N = 40; // scaling factor // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int N = 100; // scaling factor From 9f08ddd0d0d2909998d71bf6da9bce2b048d851e Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 9 Nov 2019 19:30:53 -0800 Subject: [PATCH 29/48] refactor regions; add commit tracking on a segment basis --- src/arena.c | 9 +- src/bitmap.inc.c | 14 +- src/memory.c | 382 ++++++++++++++++++++--------------------------- src/segment.c | 2 +- 4 files changed, 181 insertions(+), 226 deletions(-) diff --git a/src/arena.c b/src/arena.c index 8feec89f..1b6cf4a4 100644 --- a/src/arena.c +++ b/src/arena.c @@ -123,7 +123,7 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n mi_bitmap_index_t bitmap_index; if (mi_arena_alloc(arena, needed_bcount, &bitmap_index)) { // claimed it! set the dirty bits (todo: no need for an atomic op here?) - *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index); + *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); *memid = mi_memid_create(arena_index, bitmap_index); *commit = true; // TODO: support commit on demand? *large = arena->is_large; @@ -181,7 +181,10 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, // finally, fall back to the OS *is_zero = true; - *memid = MI_MEMID_OS; + *memid = MI_MEMID_OS; + if (*large) { + *large = mi_option_is_enabled(mi_option_large_os_pages); // try large OS pages only if enabled and allowed + } return _mi_os_alloc_aligned(size, alignment, *commit, large, tld); } @@ -288,7 +291,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec if (post > 0) { // don't use leftover bits at the end mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - mi_bitmap_claim(arena->blocks_map, fields, post, postidx); + mi_bitmap_claim(arena->blocks_map, fields, post, postidx, NULL); } mi_arena_add(arena); diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index 19e6bbb8..3847e712 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -61,6 +61,7 @@ static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { // The bit mask for a given number of blocks at a specified bit index. 
static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); + if (count == MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; return ((((uintptr_t)1 << count) - 1) << bitidx); } @@ -183,14 +184,25 @@ static inline bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, s // Set `count` bits at `bitmap_idx` to 1 atomically // Returns `true` if all `count` bits were 0 previously -static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { +static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); // mi_assert_internal((bitmap[idx] & mask) == 0); uintptr_t prev = mi_atomic_or(&bitmap[idx], mask); + if (any_zero != NULL) *any_zero = ((prev & mask) != mask); return ((prev & mask) == 0); } +// Returns `true` if all `count` bits were 1 +static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const uintptr_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); + // mi_assert_internal((bitmap[idx] & mask) == 0); + return ((mi_atomic_read(&bitmap[idx]) & mask) == mask); +} + #endif \ No newline at end of file diff --git a/src/memory.c b/src/memory.c index f8798d99..a1f94e18 100644 --- a/src/memory.c +++ b/src/memory.c @@ -65,10 +65,11 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo #define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE +#define MI_REGION_MAX_BLOCKS MI_BITMAP_FIELD_BITS #define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB (64MiB on 32 bits) -#define MI_REGION_MAX_ALLOC_SIZE (MI_REGION_SIZE/4) // 64MiB #define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) // 1024 (48 on 32 bits) - +#define MI_REGION_MAX_OBJ_BLOCKS (MI_REGION_MAX_BLOCKS/4) // 64MiB +#define MI_REGION_MAX_OBJ_SIZE (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE) // Region info is a pointer to the memory region and two bits for // its flags: is_large, and is_committed. @@ -88,20 +89,16 @@ static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, b // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. typedef struct mem_region_s { - volatile _Atomic(mi_region_info_t) info; // start of the memory area (and flags) - volatile _Atomic(uintptr_t) numa_node; // associated numa node + 1 (so 0 is no association) - size_t arena_memid; // if allocated from a (huge page) arena + volatile _Atomic(mi_region_info_t) info; // start of the memory area (and flags) + volatile _Atomic(uintptr_t) numa_node; // associated numa node + 1 (so 0 is no association) + mi_bitmap_field_t in_use; + mi_bitmap_field_t dirty; + size_t arena_memid; // if allocated from a (huge page) arena } mem_region_t; // The region map static mem_region_t regions[MI_REGION_MAX]; -// A bit mask per region for its claimed MI_SEGMENT_SIZE blocks. 
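The `any_zero` out-parameter added to `mi_bitmap_claim` above lets callers distinguish "every claimed bit was still 0" from "at least one was 0"; the region and arena code later in this series uses it to decide whether a partial commit is still needed. A minimal non-atomic sketch of the same return convention, with illustrative names and values of our own:

  #include <assert.h>
  #include <stdbool.h>
  #include <stddef.h>
  #include <stdint.h>

  // returns true when all bits in `mask` were previously 0, and reports via
  // any_zero whether at least one of them was 0 (i.e. the claim was partial)
  static bool example_claim(uintptr_t* field, uintptr_t mask, bool* any_zero) {
    uintptr_t prev = *field;
    *field |= mask;
    if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
    return ((prev & mask) == 0);
  }

  int main(void) {
    uintptr_t field = 0x2;       // bit 1 already set (say, already dirty)
    bool any_zero;
    bool all_zero = example_claim(&field, 0x3, &any_zero);  // claim bits 0..1
    assert(!all_zero && any_zero);  // partially dirty, but one block was fresh
    assert(field == 0x3);
    return 0;
  }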
-static mi_bitmap_field_t regions_map[MI_REGION_MAX]; - -// A bit mask per region to track which blocks are dirty (= potentially written to) -static mi_bitmap_field_t regions_dirty[MI_REGION_MAX]; - // Allocated regions static volatile _Atomic(uintptr_t) regions_count; // = 0; @@ -112,8 +109,7 @@ Utility functions // Blocks (of 4MiB) needed for the given size. static size_t mi_region_block_count(size_t size) { - mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); - return (size + MI_SEGMENT_SIZE - 1) / MI_SEGMENT_SIZE; + return _mi_divide_up(size, MI_SEGMENT_SIZE); } // Return a rounded commit/reset size such that we don't fragment large OS pages into small ones. @@ -134,8 +130,11 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { } -static size_t mi_memid_create(mi_bitmap_index_t bitmap_idx) { - return bitmap_idx<<1; +static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) { + mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS); + size_t idx = region - regions; + mi_assert_internal(®ions[idx] == region); + return (idx*MI_BITMAP_FIELD_BITS + bit_idx)<<1; } static size_t mi_memid_create_from_arena(size_t arena_memid) { @@ -146,177 +145,149 @@ static bool mi_memid_is_arena(size_t id) { return ((id&1)==1); } -static bool mi_memid_indices(size_t id, mi_bitmap_index_t* bitmap_idx, size_t* arena_memid) { +static bool mi_memid_indices(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) { if (mi_memid_is_arena(id)) { *arena_memid = (id>>1); return true; } else { - *bitmap_idx = (mi_bitmap_index_t)(id>>1); + size_t idx = (id >> 1) / MI_BITMAP_FIELD_BITS; + *bit_idx = (mi_bitmap_index_t)(id>>1) % MI_BITMAP_FIELD_BITS; + *region = ®ions[idx]; return false; } } /* ---------------------------------------------------------------------------- - Ensure a region is allocated from the OS (or an arena) + Allocate a region is allocated from the OS (or an arena) -----------------------------------------------------------------------------*/ -static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_info_t* pinfo, mi_os_tld_t* tld) +static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { - // ensure the region is reserved - mi_region_info_t info = mi_atomic_read(®ions[idx].info); - if (mi_unlikely(info == 0)) - { - bool region_commit = mi_option_is_enabled(mi_option_eager_region_commit); - bool region_large = allow_large; - bool is_zero = false; - size_t arena_memid = 0; - void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); - mi_assert_internal(!(region_large && !allow_large)); + // not out of regions yet? + if (mi_atomic_read_relaxed(®ions_count) >= MI_REGION_MAX - 1) return false; - if (start == NULL) { - // failure to allocate from the OS! fail - *pinfo = 0; - return false; - } - - // set the newly allocated region - // try to initialize any region up to 4 beyond the current one in - // care multiple threads are doing this concurrently (common at startup) - info = mi_region_info_create(start, region_large, region_commit); - bool claimed = false; - for (size_t i = 0; i <= 4 && idx + i < MI_REGION_MAX && !claimed; i++) { - if (!is_zero) { - // set dirty bits before CAS; this might race with a zero block but that is ok. 
- // (but writing before cas prevents a concurrent allocation to assume it is not dirty) - mi_atomic_write(®ions_dirty[idx+i], MI_BITMAP_FIELD_FULL); - } - if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { - // claimed! - regions[idx+i].arena_memid = arena_memid; - mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node(tld) + 1); - mi_atomic_increment(®ions_count); - claimed = true; - } - } - if (!claimed) { - // free our OS allocation if we didn't succeed to store it in some region - _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); - } - // continue with the actual info at our index in case another thread was quicker with the allocation - info = mi_atomic_read(®ions[idx].info); - mi_assert_internal(info != 0); + // try to allocate a fresh region from the OS + bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit)); + bool region_large = (commit && allow_large); + bool is_zero = false; + size_t arena_memid = 0; + void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); + if (start == NULL) return false; + mi_assert_internal(!(region_large && !allow_large)); + + // claim a fresh slot + const uintptr_t idx = mi_atomic_increment(®ions_count); + if (idx >= MI_REGION_MAX) { + mi_atomic_decrement(®ions_count); + _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); + return false; } - mi_assert_internal(info == mi_atomic_read(®ions[idx].info)); - mi_assert_internal(info != 0); - *pinfo = info; + + // allocated, initialize and claim the initial blocks + mem_region_t* r = ®ions[idx]; + r->numa_node = _mi_os_numa_node(tld) + 1; + r->arena_memid = arena_memid; + *bit_idx = 0; + mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL); + mi_atomic_write(&r->info, mi_region_info_create(start, region_large, region_commit)); // now make it available to others + *region = r; + return true; +} + +/* ---------------------------------------------------------------------------- + Try to claim blocks in suitable regions +-----------------------------------------------------------------------------*/ + +static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool commit, bool allow_large ) { + // initialized at all? 
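The fresh-region path above reserves a slot by bumping `regions_count` and rolling back when the table is full. The same idiom written with standard C11 atomics, as a stand-alone sketch only: `atomic_fetch_add` returns the previous value, and mimalloc's own `mi_atomic_increment` wrapper may follow a different convention, so this is not a drop-in equivalent.

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stddef.h>

  static atomic_size_t slot_count;  // number of slots handed out so far

  static bool try_claim_fresh_slot(size_t max_slots, size_t* idx) {
    size_t i = atomic_fetch_add(&slot_count, 1);  // reserve the next index
    if (i >= max_slots) {
      atomic_fetch_sub(&slot_count, 1);           // table full: undo and fail
      return false;
    }
    *idx = i;  // slot i is now exclusively ours to initialize and publish
    return true;
  }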
+ mi_region_info_t info = mi_atomic_read_relaxed(®ion->info); + if (info==0) return false; + + // numa correct + if (numa_node >= 0) { // use negative numa node to always succeed + int rnode = ((int)mi_atomic_read_relaxed(®ion->numa_node)) - 1; + if (rnode >= 0 && rnode != numa_node) return false; + } + + // note: we also skip if commit is false and the region is committed, + // that is a bit strong but prevents allocation of eager-delayed segments in an eagerly committed region + bool is_large; + bool is_committed; + mi_region_info_read(info, &is_large, &is_committed); + + if (!commit && is_committed) return false; + if (!allow_large && is_large) return false; return true; } -/* ---------------------------------------------------------------------------- - Commit blocks ------------------------------------------------------------------------------*/ - -static void* mi_region_commit_blocks(mi_bitmap_index_t bitmap_idx, mi_region_info_t info, size_t blocks, size_t size, bool* commit, bool* is_large, bool* is_zero, mi_os_tld_t* tld) +static bool mi_region_try_claim(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { - // set dirty bits - *is_zero = mi_bitmap_claim(regions_dirty, MI_REGION_MAX, blocks, bitmap_idx); + // try all regions for a free slot + const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld)); + const size_t count = mi_atomic_read(®ions_count); + size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? + for (size_t visited = 0; visited < count; visited++, idx++) { + if (idx >= count) idx = 0; // wrap around + mem_region_t* r = ®ions[idx]; + if (mi_region_is_suitable(r, numa_node, commit, allow_large)) { + if (mi_bitmap_try_claim_field(&r->in_use, 0, blocks, bit_idx)) { + tld->region_idx = idx; // remember the last found position + *region = r; + return true; + } + } + } + return false; +} - // Commit the blocks to memory + +static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) +{ + mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS); + mem_region_t* region; + mi_bitmap_index_t bit_idx; + // first try to claim in existing regions + if (!mi_region_try_claim(blocks, *commit, *is_large, ®ion, &bit_idx, tld)) { + // otherwise try to allocate a fresh region + if (!mi_region_try_alloc_os(blocks, *commit, *is_large, ®ion, &bit_idx, tld)) { + // out of regions or memory + return NULL; + } + } + + // found a region and claimed `blocks` at `bit_idx` + mi_assert_internal(region != NULL); + mi_assert_internal(mi_bitmap_is_claimed(®ion->in_use, 1, blocks, bit_idx)); + + mi_region_info_t info = mi_atomic_read(®ion->info); bool region_is_committed = false; bool region_is_large = false; void* start = mi_region_info_read(info, ®ion_is_large, ®ion_is_committed); mi_assert_internal(!(region_is_large && !*is_large)); - mi_assert_internal(start!=NULL); + mi_assert_internal(start != NULL); - void* blocks_start = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bitmap_idx) * MI_SEGMENT_SIZE); - if (*commit && !region_is_committed) { - // ensure commit - bool commit_zero = false; - _mi_os_commit(blocks_start, mi_good_commit_size(size), &commit_zero, tld->stats); // only commit needed size (unless using large OS pages) - if (commit_zero) *is_zero = true; - } - else if (!*commit && region_is_committed) { - // but even when no commit is requested, we might have committed anyway (in a huge OS page for example) - *commit = true; - } 
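The `mi_region_try_claim` loop above walks all regions round-robin, starting from the thread-local `region_idx` hint and wrapping around once. The access pattern in isolation, with an illustrative helper that is not part of the mimalloc API:

  #include <stdbool.h>
  #include <stddef.h>

  // scan `count` slots starting at `start`, wrapping around once; returns the
  // first index accepted by `suitable`, or (size_t)-1 if none qualifies
  static size_t scan_round_robin(size_t start, size_t count,
                                 bool (*suitable)(size_t)) {
    size_t idx = start;
    for (size_t visited = 0; visited < count; visited++, idx++) {
      if (idx >= count) idx = 0;      // wrap around
      if (suitable(idx)) return idx;  // caller can remember idx as the next hint
    }
    return (size_t)-1;
  }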
- - // and return the allocation - mi_assert_internal(blocks_start != NULL); + bool any_zero = false; + *is_zero = mi_bitmap_claim(®ion->dirty, 1, blocks, bit_idx, &any_zero); + if (!mi_option_is_enabled(mi_option_eager_commit)) any_zero = true; // if no eager commit, even dirty segments may be partially committed *is_large = region_is_large; - return blocks_start; + *memid = mi_memid_create(region, bit_idx); + void* p = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE); + if (*commit && !region_is_committed && any_zero) { // want to commit, but not yet fully committed? + // ensure commit + _mi_os_commit(p, blocks * MI_SEGMENT_SIZE, is_zero, tld->stats); + } + else { + *commit = region_is_committed || !any_zero; + } + + + // and return the allocation + mi_assert_internal(p != NULL); + return p; } -/* ---------------------------------------------------------------------------- - Claim and allocate blocks in a region ------------------------------------------------------------------------------*/ - -static bool mi_region_alloc_blocks( - size_t idx, size_t blocks, size_t size, - bool* commit, bool* allow_large, bool* is_zero, - void** p, size_t* id, mi_os_tld_t* tld) -{ - mi_bitmap_index_t bitmap_idx; - if (!mi_bitmap_try_claim_field(regions_map, idx, blocks, &bitmap_idx)) { - return true; // no error, but also no success - } - mi_region_info_t info; - if (!mi_region_ensure_allocated(idx,*allow_large,&info,tld)) { - // failed to allocate region memory, unclaim the bits and fail - mi_bitmap_unclaim(regions_map, MI_REGION_MAX, blocks, bitmap_idx); - return false; - } - *p = mi_region_commit_blocks(bitmap_idx,info,blocks,size,commit,allow_large,is_zero,tld); - *id = mi_memid_create(bitmap_idx); - return true; -} - - -/* ---------------------------------------------------------------------------- - Try to allocate blocks in suitable regions ------------------------------------------------------------------------------*/ - -static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool allow_large ) { - uintptr_t m = mi_atomic_read_relaxed(®ions_map[idx]); - if (m == MI_BITMAP_FIELD_FULL) return false; - if (numa_node >= 0) { // use negative numa node to always succeed - int rnode = ((int)mi_atomic_read_relaxed(®ions[idx].numa_node)) - 1; - if (rnode >= 0 && rnode != numa_node) return false; - } - if (commit && allow_large) return true; // always ok - - // otherwise skip incompatible regions if possible. - // this is not guaranteed due to multiple threads allocating at the same time but - // that's ok. In secure mode, large is never allowed for any thread, so that works out; - // otherwise we might just not be able to reset/decommit individual pages sometimes. - mi_region_info_t info = mi_atomic_read_relaxed(®ions[idx].info); - bool is_large; - bool is_committed; - void* start = mi_region_info_read(info, &is_large, &is_committed); - // note: we also skip if commit is false and the region is committed, - // that is a bit strong but prevents allocation of eager delayed segments in - // committed memory - bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation? - return ok; -} - -// Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim. -// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. 
-// (not being able to claim is not considered an error so check for `p != NULL` afterwards). -static bool mi_region_try_alloc_blocks( - int numa_node, size_t idx, size_t blocks, size_t size, - bool* commit, bool* allow_large, bool* is_zero, - void** p, size_t* id, mi_os_tld_t* tld) -{ - // check if there are available blocks in the region.. - mi_assert_internal(idx < MI_REGION_MAX); - if (mi_region_is_suitable(numa_node, idx, *commit, *allow_large)) { - return mi_region_alloc_blocks(idx, blocks, size, commit, allow_large, is_zero, p, id, tld); - } - return true; // no error, but no success either -} /* ---------------------------------------------------------------------------- Allocation @@ -324,63 +295,35 @@ static bool mi_region_try_alloc_blocks( // Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`. // (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`) -void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, - size_t* id, mi_os_tld_t* tld) +void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) { - mi_assert_internal(id != NULL && tld != NULL); + mi_assert_internal(memid != NULL && tld != NULL); mi_assert_internal(size > 0); - *id = 0; + *memid = 0; *is_zero = false; bool default_large = false; if (large==NULL) large = &default_large; // ensure `large != NULL` - - // use direct OS allocation for huge blocks or alignment - if (size > MI_REGION_MAX_ALLOC_SIZE || alignment > MI_SEGMENT_ALIGN) { - size_t arena_memid = 0; - void* p = _mi_arena_alloc_aligned(mi_good_commit_size(size), alignment, commit, large, is_zero, &arena_memid, tld); // round up size - *id = mi_memid_create_from_arena(arena_memid); - return p; - } - - // always round size to OS page size multiple (so commit/decommit go over the entire range) - // TODO: use large OS page size here? + if (size == 0) return NULL; size = _mi_align_up(size, _mi_os_page_size()); - // calculate the number of needed blocks + // allocate from regions if possible + size_t arena_memid; const size_t blocks = mi_region_block_count(size); - mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE); - - // find a range of free blocks - const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld)); - void* p = NULL; - const size_t count = mi_atomic_read(®ions_count); - size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? - for (size_t visited = 0; visited < count; visited++, idx++) { - if (idx >= count) idx = 0; // wrap around - if (!mi_region_try_alloc_blocks(numa_node, idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error - if (p != NULL) break; - } - - if (p == NULL) { - // no free range in existing regions -- try to extend beyond the count.. 
but at most 8 regions - for (idx = count; idx < mi_atomic_read_relaxed(®ions_count) + 8 && idx < MI_REGION_MAX; idx++) { - if (!mi_region_try_alloc_blocks(numa_node, idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error - if (p != NULL) break; + if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN) { + void* p = mi_region_try_alloc(blocks, commit, large, is_zero, memid, tld); + mi_assert_internal(p == NULL || (uintptr_t)p % alignment == 0); + if (p != NULL) { + if (*commit) { ((uint8_t*)p)[0] = 0; } + return p; } + _mi_warning_message("unable to allocate from region: size %zu\n", size); } - if (p == NULL) { - // we could not find a place to allocate, fall back to the os directly - _mi_warning_message("unable to allocate from region: size %zu\n", size); - size_t arena_memid = 0; - p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_zero, &arena_memid, tld); - *id = mi_memid_create_from_arena(arena_memid); - } - else { - tld->region_idx = idx; // next start of search - } - + // and otherwise fall back to the OS + void* p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_zero, &arena_memid, tld); + *memid = mi_memid_create_from_arena(arena_memid); mi_assert_internal( p == NULL || (uintptr_t)p % alignment == 0); + if (p != NULL && *commit) { ((uint8_t*)p)[0] = 0; } return p; } @@ -396,31 +339,28 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { if (p==NULL) return; if (size==0) return; size_t arena_memid = 0; - mi_bitmap_index_t bitmap_idx; - if (mi_memid_indices(id,&bitmap_idx,&arena_memid)) { + mi_bitmap_index_t bit_idx; + mem_region_t* region; + if (mi_memid_indices(id,®ion,&bit_idx,&arena_memid)) { // was a direct arena allocation, pass through _mi_arena_free(p, size, arena_memid, stats); } else { // allocated in a region - mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); if (size > MI_REGION_MAX_ALLOC_SIZE) return; + mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return; // we can align the size up to page size (as we allocate that way too) // this ensures we fully commit/decommit/reset size = _mi_align_up(size, _mi_os_page_size()); - const size_t blocks = mi_region_block_count(size); - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`? - mem_region_t* region = ®ions[idx]; + const size_t blocks = mi_region_block_count(size); mi_region_info_t info = mi_atomic_read(®ion->info); bool is_large; bool is_eager_committed; void* start = mi_region_info_read(info,&is_large,&is_eager_committed); mi_assert_internal(start != NULL); - void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); + void* blocks_start = (uint8_t*)start + (bit_idx * MI_SEGMENT_SIZE); mi_assert_internal(blocks_start == p); // not a pointer in our area? - mi_assert_internal(bitidx + blocks <= MI_BITMAP_FIELD_BITS); - if (blocks_start != p || bitidx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`? + mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS); + if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`? // decommit (or reset) the blocks to reduce the working set. 
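// Worked example (ours, not part of the patch) of the memid encoding that
// `mi_memid_indices` just decoded, assuming 64-bit bitmap fields:
//   region 3, block 5  ->  id = (3*64 + 5) << 1 = 394    (bit 0 clear)
//   decode: 394 & 1 == 0, 394 >> 1 = 197, region = 197/64 = 3, bit = 197%64 = 5
// Arena allocations instead keep bit 0 set and store the arena memid in the
// remaining bits, which is exactly what `mi_memid_is_arena` and the `id >> 1`
// above check for.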
// TODO: implement delayed decommit/reset as these calls are too expensive @@ -446,7 +386,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { // this frees up virtual address space which might be useful on 32-bit systems? // and unclaim - mi_bitmap_unclaim(regions_map, MI_REGION_MAX, blocks, bitmap_idx); + mi_bitmap_unclaim(®ion->in_use, 1, blocks, bit_idx); } } @@ -456,13 +396,15 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { -----------------------------------------------------------------------------*/ void _mi_mem_collect(mi_stats_t* stats) { // free every region that has no segments in use. - for (size_t i = 0; i < regions_count; i++) { - if (mi_atomic_read_relaxed(®ions_map[i]) == 0) { + uintptr_t rcount = mi_atomic_read_relaxed(®ions_count); + for (size_t i = 0; i < rcount; i++) { + mem_region_t* region = ®ions[i]; + if (mi_atomic_read_relaxed(®ion->info) != 0) { // if no segments used, try to claim the whole region uintptr_t m; do { - m = mi_atomic_read_relaxed(®ions_map[i]); - } while(m == 0 && !mi_atomic_cas_weak(®ions_map[i], MI_BITMAP_FIELD_FULL, 0 )); + m = mi_atomic_read_relaxed(®ion->in_use); + } while(m == 0 && !mi_atomic_cas_weak(®ion->in_use, MI_BITMAP_FIELD_FULL, 0 )); if (m == 0) { // on success, free the whole region bool is_eager_committed; @@ -471,9 +413,7 @@ void _mi_mem_collect(mi_stats_t* stats) { _mi_arena_free(start, MI_REGION_SIZE, regions[i].arena_memid, stats); } // and release - mi_atomic_write(®ions[i].info,0); - mi_atomic_write(®ions_dirty[i],0); - mi_atomic_write(®ions_map[i],0); + mi_atomic_write(®ion->info,0); } } } diff --git a/src/segment.c b/src/segment.c index 178e0eda..b2b37fac 100644 --- a/src/segment.c +++ b/src/segment.c @@ -370,7 +370,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, } segment->memid = memid; segment->mem_is_fixed = mem_large; - segment->mem_is_committed = commit; + segment->mem_is_committed = commit; mi_segments_track_size((long)segment_size, tld); } mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); From d2279b2a3faf7c2e084644449326306ef8d4f619 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 10 Nov 2019 08:13:40 -0800 Subject: [PATCH 30/48] update test-stress with better object distribution --- test/test-stress.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/test-stress.c b/test/test-stress.c index be2a9c67..37572d42 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -17,8 +17,8 @@ terms of the MIT license. 
#include // argument defaults -static int THREADS = 32; // more repeatable if THREADS <= #processors -static int N = 40; // scaling factor +static int THREADS = 8; // more repeatable if THREADS <= #processors +static int N = 200; // scaling factor // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int N = 100; // scaling factor @@ -63,7 +63,11 @@ static bool chance(size_t perc, random_t r) { } static void* alloc_items(size_t items, random_t r) { - if (chance(1, r)) items *= 100; // 1% huge objects; + if (chance(1, r)) { + if (chance(1, r)) items *= 1000; // 0.01% giant + else if (chance(10, r)) items *= 100; // 0.1% huge + else items *= 10; // 1% large objects; + } if (items==40) items++; // pthreads uses that size for stack increases uintptr_t* p = (uintptr_t*)mi_malloc(items*sizeof(uintptr_t)); if (p != NULL) { From 21bbb1be870c8b9bd6ca057257a4cbb0ec57e6e5 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 10 Nov 2019 12:36:55 -0800 Subject: [PATCH 31/48] fix warnings --- src/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/init.c b/src/init.c index 7e704e7a..d5ec03c2 100644 --- a/src/init.c +++ b/src/init.c @@ -100,8 +100,8 @@ static mi_tld_t tld_main = { 0, false, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os }, // segments - { 0, tld_main_stats }, // os - { MI_STATS_NULL } // stats + { 0, tld_main_stats, {{0,NULL,0}} }, // os + { MI_STATS_NULL } // stats }; mi_heap_t _mi_heap_main = { From 83a066fd2d0d7484abf6372e41ac777c721c761a Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 11 Nov 2019 09:46:02 -0800 Subject: [PATCH 32/48] remove reset_decommits option --- include/mimalloc.h | 3 +-- src/memory.c | 28 ++++++++++++---------------- src/options.c | 7 +++---- src/os.c | 20 +++----------------- 4 files changed, 19 insertions(+), 39 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 70b6e412..4c542ee0 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -272,9 +272,8 @@ typedef enum mi_option_e { mi_option_segment_cache, mi_option_page_reset, mi_option_cache_reset, - mi_option_reset_decommits, - mi_option_eager_commit_delay, mi_option_segment_reset, + mi_option_eager_commit_delay, mi_option_os_tag, mi_option_max_numa_node, mi_option_max_errors, diff --git a/src/memory.c b/src/memory.c index a1f94e18..ceb9a702 100644 --- a/src/memory.c +++ b/src/memory.c @@ -350,12 +350,12 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return; // we can align the size up to page size (as we allocate that way too) // this ensures we fully commit/decommit/reset - size = _mi_align_up(size, _mi_os_page_size()); - const size_t blocks = mi_region_block_count(size); + size = _mi_align_up(size, _mi_os_page_size()); + const size_t blocks = mi_region_block_count(size); mi_region_info_t info = mi_atomic_read(®ion->info); bool is_large; - bool is_eager_committed; - void* start = mi_region_info_read(info,&is_large,&is_eager_committed); + bool is_committed; + void* start = mi_region_info_read(info, &is_large, &is_committed); mi_assert_internal(start != NULL); void* blocks_start = (uint8_t*)start + (bit_idx * MI_SEGMENT_SIZE); mi_assert_internal(blocks_start == p); // not a pointer in our area? 
@@ -366,18 +366,14 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { // TODO: implement delayed decommit/reset as these calls are too expensive // if the memory is reused soon. // reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large - if (!is_large) { - if (mi_option_is_enabled(mi_option_segment_reset)) { - if (!is_eager_committed && // cannot reset large pages - (mi_option_is_enabled(mi_option_eager_commit) || // cannot reset halfway committed segments, use `option_page_reset` instead - mi_option_is_enabled(mi_option_reset_decommits))) // but we can decommit halfway committed segments - { - _mi_os_reset(p, size, stats); - //_mi_os_decommit(p, size, stats); // todo: and clear dirty bits? - } - } - } - if (!is_eager_committed) { + if (!is_large && + mi_option_is_enabled(mi_option_segment_reset) && + mi_option_is_enabled(mi_option_eager_commit)) // cannot reset halfway committed segments, use `option_page_reset` instead + { + _mi_os_reset(p, size, stats); + //_mi_os_decommit(p, size, stats); // todo: and clear dirty bits? + } + if (!is_committed) { // adjust commit statistics as we commit again when re-using the same slot _mi_stat_decrease(&stats->committed, mi_good_commit_size(size)); } diff --git a/src/options.c b/src/options.c index 63b1612a..75a2736a 100644 --- a/src/options.c +++ b/src/options.c @@ -65,11 +65,10 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread - { 0, UNINIT, MI_OPTION(page_reset) }, + { 1, UNINIT, MI_OPTION(page_reset) }, { 0, UNINIT, MI_OPTION(cache_reset) }, - { 0, UNINIT, MI_OPTION(reset_decommits) }, // note: cannot enable this if secure is on - { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) + { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 256, UNINIT, MI_OPTION(max_numa_node) }, // maximum allowed numa node { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output @@ -88,7 +87,7 @@ void _mi_options_init(void) { mi_option_desc_t* desc = &options[option]; _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value); } - } + } mi_max_error_count = mi_option_get(mi_option_max_errors); } diff --git a/src/os.c b/src/os.c index 027df6ab..5229381b 100644 --- a/src/os.c +++ b/src/os.c @@ -646,10 +646,6 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats) { return mi_os_commitx(addr, size, false, true /* conservative? */, &is_zero, stats); } -bool _mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) { - return mi_os_commitx(addr, size, true, true /* conservative? */, is_zero, stats); -} - // Signal to the OS that the address range is no longer in use // but may be used later again. This will release physical memory @@ -708,22 +704,12 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) // pages and reduce swapping while keeping the memory committed. // We page align to a conservative area inside the range to reset. 
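Since this patch makes `_mi_os_reset` unconditional (the `reset_decommits` redirect is gone), it may help to recall the distinction the option used to toggle. A generic POSIX-flavoured sketch, purely illustrative and not mimalloc's actual implementation, which lives in the `mi_os_resetx`/`mi_os_commitx` paths of os.c and is platform specific:

  #include <stddef.h>
  #include <sys/mman.h>

  static void example_reset(void* p, size_t size) {
    // the range stays mapped and addressable; backing pages are discarded and
    // anonymous memory reads back as zero on the next access
    madvise(p, size, MADV_DONTNEED);
  }

  static void example_decommit(void* p, size_t size) {
    // stronger: remove access as well, so the range must be explicitly
    // committed (made accessible) again before reuse
    mprotect(p, size, PROT_NONE);
  }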
bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { - if (mi_option_is_enabled(mi_option_reset_decommits)) { - return _mi_os_decommit(addr,size,stats); - } - else { - return mi_os_resetx(addr, size, true, stats); - } + return mi_os_resetx(addr, size, true, stats); } bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) { - if (mi_option_is_enabled(mi_option_reset_decommits)) { - return _mi_os_commit_unreset(addr, size, is_zero, stats); // re-commit it (conservatively!) - } - else { - *is_zero = false; - return mi_os_resetx(addr, size, false, stats); - } + *is_zero = false; + return mi_os_resetx(addr, size, false, stats); } From 93a646338343984b86b00b1c7852322eafa7190e Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 11 Nov 2019 14:16:45 -0800 Subject: [PATCH 33/48] only allow commit delay for small and medium objects --- src/options.c | 2 +- src/segment.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/options.c b/src/options.c index 75a2736a..dbb7df79 100644 --- a/src/options.c +++ b/src/options.c @@ -65,7 +65,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread - { 1, UNINIT, MI_OPTION(page_reset) }, + { 0, UNINIT, MI_OPTION(page_reset) }, { 0, UNINIT, MI_OPTION(cache_reset) }, { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed diff --git a/src/segment.c b/src/segment.c index b2b37fac..d089078c 100644 --- a/src/segment.c +++ b/src/segment.c @@ -328,9 +328,9 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, size_t page_size = (page_kind == MI_PAGE_HUGE ? 
segment_size : (size_t)1 << page_shift); // Try to get it from our thread local cache first - bool eager_delay = (tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); - bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit); - bool commit = eager || (page_kind > MI_PAGE_MEDIUM); + bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); + bool eager = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit); + bool commit = eager || (page_kind >= MI_PAGE_LARGE); bool protection_still_good = false; bool is_zero = false; mi_segment_t* segment = mi_segment_cache_pop(segment_size, tld); @@ -359,7 +359,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, else { // Allocate the segment from the OS size_t memid; - bool mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy + bool mem_large = (!eager_delayed && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy segment = (mi_segment_t*)_mi_mem_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &commit, &mem_large, &is_zero, &memid, os_tld); if (segment == NULL) return NULL; // failed to allocate if (!commit) { From 534e1e39ef29946e502fd0f668d2dc80ffd141da Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 11 Nov 2019 14:42:29 -0800 Subject: [PATCH 34/48] allow allocation in committed regions even if not requested --- src/memory.c | 6 ++---- src/options.c | 4 ++-- src/segment.c | 4 +++- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/memory.c b/src/memory.c index ceb9a702..24239e05 100644 --- a/src/memory.c +++ b/src/memory.c @@ -210,14 +210,12 @@ static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, boo if (rnode >= 0 && rnode != numa_node) return false; } - // note: we also skip if commit is false and the region is committed, - // that is a bit strong but prevents allocation of eager-delayed segments in an eagerly committed region + // check allow-large bool is_large; bool is_committed; mi_region_info_read(info, &is_large, &is_committed); - - if (!commit && is_committed) return false; if (!allow_large && is_large) return false; + return true; } diff --git a/src/options.c b/src/options.c index dbb7df79..694b916b 100644 --- a/src/options.c +++ b/src/options.c @@ -65,8 +65,8 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread - { 0, UNINIT, MI_OPTION(page_reset) }, - { 0, UNINIT, MI_OPTION(cache_reset) }, + { 0, UNINIT, MI_OPTION(page_reset) }, // reset pages on free + { 0, UNINIT, MI_OPTION(cache_reset) }, // reset segment cache on free { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose diff --git a/src/segment.c b/src/segment.c index d089078c..eb5a0390 100644 --- a/src/segment.c +++ b/src/segment.c @@ -327,12 +327,14 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, mi_assert_internal(segment_size >= required); size_t page_size = (page_kind == MI_PAGE_HUGE ? 
segment_size : (size_t)1 << page_shift); - // Try to get it from our thread local cache first + // Initialize parameters bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); bool eager = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit); bool commit = eager || (page_kind >= MI_PAGE_LARGE); bool protection_still_good = false; bool is_zero = false; + + // Try to get it from our thread local cache first mi_segment_t* segment = mi_segment_cache_pop(segment_size, tld); if (segment != NULL) { if (MI_SECURE!=0) { From 2bb058bd25258c2e7a9fb2c1a64400ec780c2912 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 11 Nov 2019 14:44:32 -0800 Subject: [PATCH 35/48] remove cache_reset parameter --- include/mimalloc.h | 1 - src/options.c | 1 - src/segment.c | 6 +----- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 4c542ee0..6df889a4 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -271,7 +271,6 @@ typedef enum mi_option_e { mi_option_reserve_huge_os_pages, mi_option_segment_cache, mi_option_page_reset, - mi_option_cache_reset, mi_option_segment_reset, mi_option_eager_commit_delay, mi_option_os_tag, diff --git a/src/options.c b/src/options.c index 694b916b..1231e1c9 100644 --- a/src/options.c +++ b/src/options.c @@ -66,7 +66,6 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread { 0, UNINIT, MI_OPTION(page_reset) }, // reset pages on free - { 0, UNINIT, MI_OPTION(cache_reset) }, // reset segment cache on free { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose diff --git a/src/segment.c b/src/segment.c index eb5a0390..ef24c660 100644 --- a/src/segment.c +++ b/src/segment.c @@ -280,9 +280,6 @@ static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) return false; } mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); - if (!segment->mem_is_fixed && mi_option_is_enabled(mi_option_cache_reset)) { - _mi_mem_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size, tld->stats); - } segment->next = tld->cache; tld->cache = segment; tld->cache_count++; @@ -351,8 +348,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, _mi_mem_commit(segment, segment->segment_size, &is_zero, tld->stats); segment->mem_is_committed = true; } - if (!segment->mem_is_fixed && - (mi_option_is_enabled(mi_option_cache_reset) || mi_option_is_enabled(mi_option_page_reset))) { + if (!segment->mem_is_fixed && mi_option_is_enabled(mi_option_page_reset)) { bool reset_zero = false; _mi_mem_unreset(segment, segment->segment_size, &reset_zero, tld->stats); if (reset_zero) is_zero = true; From db3f1c4bfadcb7007357fd61d7dc24369ae8fe31 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 11 Nov 2019 15:26:11 -0800 Subject: [PATCH 36/48] add commit info to arenas --- src/arena.c | 66 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/src/arena.c b/src/arena.c index 1b6cf4a4..02890bd6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -33,6 +33,7 @@ of 256MiB in 
practice. #include "bitmap.inc.c" // atomic bitmap + // os.c void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); void _mi_os_free(void* p, size_t size, mi_stats_t* stats); @@ -40,6 +41,7 @@ void _mi_os_free(void* p, size_t size, mi_stats_t* stats); void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize); void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats); +bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); int _mi_os_numa_node_count(void); /* ----------------------------------------------------------- @@ -56,13 +58,15 @@ int _mi_os_numa_node_count(void); typedef struct mi_arena_s { uint8_t* start; // the start of the memory area size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) - size_t field_count; // number of bitmap fields + size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) int numa_node; // associated NUMA node bool is_zero_init; // is the arena zero initialized? + bool is_committed; // is the memory committed bool is_large; // large OS page allocated volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks + mi_bitmap_field_t* blocks_committed; // if `!is_committed`, are the blocks committed? + mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) } mi_arena_t; @@ -104,7 +108,7 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* size_t idx = mi_atomic_read(&arena->search_idx); // start from last search for (size_t visited = 0; visited < fcount; visited++, idx++) { if (idx >= fcount) idx = 0; // wrap around - if (mi_bitmap_try_claim_field(arena->blocks_map, idx, blocks, bitmap_idx)) { + if (mi_bitmap_try_claim_field(arena->blocks_inuse, idx, blocks, bitmap_idx)) { mi_atomic_write(&arena->search_idx, idx); // start search from here next time return true; } @@ -118,31 +122,46 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* ----------------------------------------------------------- */ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool* commit, bool* large, bool* is_zero, size_t* memid) + bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) { mi_bitmap_index_t bitmap_index; - if (mi_arena_alloc(arena, needed_bcount, &bitmap_index)) { - // claimed it! set the dirty bits (todo: no need for an atomic op here?) - *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); - *memid = mi_memid_create(arena_index, bitmap_index); - *commit = true; // TODO: support commit on demand? - *large = arena->is_large; - return (arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE)); + if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL; + + // claimed it! set the dirty bits (todo: no need for an atomic op here?) 
+ void* p = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE); + *memid = mi_memid_create(arena_index, bitmap_index); + *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); + *large = arena->is_large; + if (arena->is_committed) { + // always committed + *commit = true; } - return NULL; + else if (commit) { + // ensure commit now + bool any_zero; + mi_bitmap_claim(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_zero); + if (any_zero) { + bool commit_zero; + _mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats); + if (commit_zero) *is_zero = true; + } + } + else { + // no need to commit, but check if already fully committed + *commit = mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); + } + return p; } void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) { - mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(commit != NULL && large != NULL && is_zero != NULL && memid != NULL && tld != NULL); mi_assert_internal(size > 0); *memid = MI_MEMID_OS; *is_zero = false; - bool default_large = false; - if (large==NULL) large = &default_large; // ensure `large != NULL` - + // try to allocate in an arena if the alignment is small enough // and the object is not too large or too small. if (alignment <= MI_SEGMENT_ALIGN && @@ -160,7 +179,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local? (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages { - void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid); + void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid, tld); mi_assert_internal((uintptr_t)p % alignment == 0); if (p != NULL) return p; } @@ -172,7 +191,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local! (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages { - void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid); + void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid, tld); mi_assert_internal((uintptr_t)p % alignment == 0); if (p != NULL) return p; } @@ -182,9 +201,6 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, // finally, fall back to the OS *is_zero = true; *memid = MI_MEMID_OS; - if (*large) { - *large = mi_option_is_enabled(mi_option_large_os_pages); // try large OS pages only if enabled and allowed - } return _mi_os_alloc_aligned(size, alignment, *commit, large, tld); } @@ -223,7 +239,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) { return; } const size_t blocks = mi_block_count_of_size(size); - bool ones = mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx); + bool ones = mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); if (!ones) { _mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size); return; @@ -283,15 +299,17 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec arena->numa_node = numa_node; // TODO: or get the current numa node if -1? 
(now it allows anyone to allocate on -1) arena->is_large = true; arena->is_zero_init = true; + arena->is_committed = true; arena->search_idx = 0; - arena->blocks_dirty = &arena->blocks_map[bcount]; + arena->blocks_dirty = &arena->blocks_inuse[bcount]; + arena->blocks_committed = NULL; // the bitmaps are already zero initialized due to os_alloc // just claim leftover blocks if needed size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; if (post > 0) { // don't use leftover bits at the end mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - mi_bitmap_claim(arena->blocks_map, fields, post, postidx, NULL); + mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); } mi_arena_add(arena); From 5e6754f3f7905485ca74546ab082f4c3bc5404fd Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 11 Nov 2019 15:45:31 -0800 Subject: [PATCH 37/48] track commit status per block in a region --- src/memory.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/src/memory.c b/src/memory.c index 208b9b7e..8299bbc2 100644 --- a/src/memory.c +++ b/src/memory.c @@ -59,7 +59,7 @@ static bool mi_delay_remove(mi_delay_slot_t* slots, size_t count, void* p, size_ // Constants #if (MI_INTPTR_SIZE==8) -#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 40KiB for the region map +#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 48KiB for the region map #elif (MI_INTPTR_SIZE==4) #define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // ~ KiB for the region map #else @@ -94,8 +94,9 @@ static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, b typedef struct mem_region_s { volatile _Atomic(mi_region_info_t) info; // start of the memory area (and flags) volatile _Atomic(uintptr_t) numa_node; // associated numa node + 1 (so 0 is no association) - mi_bitmap_field_t in_use; - mi_bitmap_field_t dirty; + mi_bitmap_field_t in_use; // bit per in-use block + mi_bitmap_field_t dirty; // track if non-zero per block + mi_bitmap_field_t commit; // track if committed per block (if `!info.is_committed)) size_t arena_memid; // if allocated from a (huge page) arena } mem_region_t; @@ -165,20 +166,20 @@ static bool mi_memid_indices(size_t id, mem_region_t** region, mi_bitmap_index_t Allocate a region is allocated from the OS (or an arena) -----------------------------------------------------------------------------*/ -static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) +static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { // not out of regions yet? 
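The region `info` word decoded by mi_region_info_read above carries the start pointer together with the is_large and is_committed flags in a single value. One common way such packing works is to reuse the low bits freed up by alignment; a minimal sketch under that assumption, with illustrative demo_* names:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef uintptr_t demo_info_t;

static demo_info_t demo_info_create(void* start, bool is_large, bool is_committed) {
  assert(((uintptr_t)start & 0x3) == 0);          // alignment guarantees two free low bits
  return (uintptr_t)start | (is_large ? 2 : 0) | (is_committed ? 1 : 0);
}

static void* demo_info_read(demo_info_t info, bool* is_large, bool* is_committed) {
  if (is_large != NULL)     *is_large     = ((info & 2) != 0);
  if (is_committed != NULL) *is_committed = ((info & 1) != 0);
  return (void*)(info & ~(uintptr_t)0x3);         // strip the flag bits to recover the pointer
}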
if (mi_atomic_read_relaxed(®ions_count) >= MI_REGION_MAX - 1) return false; // try to allocate a fresh region from the OS bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit)); - bool region_large = (commit && allow_large); - bool is_zero = false; + bool region_large = (commit && allow_large); + bool is_zero = false; size_t arena_memid = 0; void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); if (start == NULL) return false; mi_assert_internal(!(region_large && !allow_large)); - + // claim a fresh slot const uintptr_t idx = mi_atomic_increment(®ions_count); if (idx >= MI_REGION_MAX) { @@ -191,8 +192,13 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t* r = ®ions[idx]; r->numa_node = _mi_os_numa_node(tld) + 1; r->arena_memid = arena_memid; + mi_atomic_write(&r->in_use, 0); + mi_atomic_write(&r->dirty, (is_zero ? 0 : ~0UL)); + mi_atomic_write(&r->commit, (region_commit ? ~0UL : 0)); *bit_idx = 0; mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL); + + // and share it mi_atomic_write(&r->info, mi_region_info_create(start, region_large, region_commit)); // now make it available to others *region = r; return true; @@ -269,20 +275,28 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo mi_assert_internal(!(region_is_large && !*is_large)); mi_assert_internal(start != NULL); - bool any_zero = false; - *is_zero = mi_bitmap_claim(®ion->dirty, 1, blocks, bit_idx, &any_zero); - if (!mi_option_is_enabled(mi_option_eager_commit)) any_zero = true; // if no eager commit, even dirty segments may be partially committed + *is_zero = mi_bitmap_claim(®ion->dirty, 1, blocks, bit_idx, NULL); *is_large = region_is_large; *memid = mi_memid_create(region, bit_idx); void* p = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE); - if (*commit && !region_is_committed && any_zero) { // want to commit, but not yet fully committed? - // ensure commit - _mi_os_commit(p, blocks * MI_SEGMENT_SIZE, is_zero, tld->stats); + if (region_is_committed) { + // always committed + *commit = true; + } + else if (*commit) { + // ensure commit + bool any_zero; + mi_bitmap_claim(®ion->commit, 1, blocks, bit_idx, &any_zero); + if (any_zero) { + bool commit_zero; + _mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld); + if (commit_zero) *is_zero = true; + } } else { - *commit = region_is_committed || !any_zero; - } - + // no need to commit, but check if already fully committed + *commit = mi_bitmap_is_claimed(®ion->commit, 1, blocks, bit_idx); + } // and return the allocation mi_assert_internal(p != NULL); @@ -374,7 +388,8 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_os_tld_t* tld) { mi_option_is_enabled(mi_option_segment_reset) && mi_option_is_enabled(mi_option_eager_commit)) // cannot reset halfway committed segments, use `option_page_reset` instead { - _mi_os_reset(p, size, tld->stats); + // note: don't use `_mi_mem_reset` as it is shared with other threads! 
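In mi_region_try_alloc above, claiming the dirty bits is what tells the caller whether the returned blocks can still be assumed zero-initialized by the OS. A minimal sketch of that bookkeeping for one region and one block (illustrative demo_* names, no atomics):

#include <stdbool.h>
#include <stdint.h>

typedef struct demo_region_s {
  uint64_t dirty;   // bit i is set once block i has been handed out at least once
} demo_region_t;

static bool demo_claim_is_zero(demo_region_t* r, unsigned block_idx) {
  const uint64_t mask = (uint64_t)1 << block_idx;
  const bool was_dirty = ((r->dirty & mask) != 0);
  r->dirty |= mask;           // from now on the block may contain old data
  return !was_dirty;          // true: freshly mapped memory, guaranteed zero
}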
+ _mi_os_reset(p, size, tld->stats); // TODO: maintain reset bits to unreset } if (!is_committed) { // adjust commit statistics as we commit again when re-using the same slot From a0958b2da696a308f8c200f45f08bf1ab3e5f14b Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 11 Nov 2019 17:06:16 -0800 Subject: [PATCH 38/48] enable more reset delay slots --- include/mimalloc-types.h | 13 ++++++-- src/init.c | 9 ++++-- src/memory.c | 70 ++++++++++++++++++++++++++-------------- src/options.c | 2 +- src/segment.c | 4 ++- 5 files changed, 66 insertions(+), 32 deletions(-) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 2651fc85..0ce91339 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -390,13 +390,20 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); // ------------------------------------------------------ typedef int64_t mi_msecs_t; +#define MI_RESET_DELAY_SLOTS (256) + typedef struct mi_delay_slot_s { mi_msecs_t expire; uint8_t* addr; size_t size; } mi_delay_slot_t; -#define MI_RESET_DELAY_SLOTS (128) +typedef struct mi_delay_slots_s { + size_t capacity; // always `MI_RESET_DELAY_SLOTS` + size_t count; // current slots used (`<= capacity`) + mi_delay_slot_t slots[MI_RESET_DELAY_SLOTS]; +} mi_delay_slots_t; + // ------------------------------------------------------ // Thread Local data @@ -411,8 +418,8 @@ typedef struct mi_segment_queue_s { // OS thread local data typedef struct mi_os_tld_s { size_t region_idx; // start point for next allocation - mi_stats_t* stats; // points to tld stats - mi_delay_slot_t reset_delay[MI_RESET_DELAY_SLOTS]; + mi_delay_slots_t* reset_delay; // delay slots for OS reset operations + mi_stats_t* stats; // points to tld stats } mi_os_tld_t; // Segments thread local data diff --git a/src/init.c b/src/init.c index d5ec03c2..c9700cd5 100644 --- a/src/init.c +++ b/src/init.c @@ -100,8 +100,8 @@ static mi_tld_t tld_main = { 0, false, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os }, // segments - { 0, tld_main_stats, {{0,NULL,0}} }, // os - { MI_STATS_NULL } // stats + { 0, NULL, tld_main_stats }, // os + { MI_STATS_NULL } // stats }; mi_heap_t _mi_heap_main = { @@ -192,6 +192,7 @@ uintptr_t _mi_random_init(uintptr_t seed /* can be zero */) { typedef struct mi_thread_data_s { mi_heap_t heap; // must come first due to cast in `_mi_heap_done` mi_tld_t tld; + mi_delay_slots_t reset_delay; } mi_thread_data_t; // Initialize the thread local default heap, called from `mi_thread_init` @@ -211,6 +212,7 @@ static bool _mi_heap_init(void) { } mi_tld_t* tld = &td->tld; mi_heap_t* heap = &td->heap; + mi_delay_slots_t* reset_delay = &td->reset_delay; memcpy(heap, &_mi_heap_empty, sizeof(*heap)); heap->thread_id = _mi_thread_id(); heap->random = _mi_random_init(heap->thread_id); @@ -221,6 +223,9 @@ static bool _mi_heap_init(void) { tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; tld->os.stats = &tld->stats; + tld->os.reset_delay = reset_delay; + memset(reset_delay, 0, sizeof(*reset_delay)); + reset_delay->capacity = MI_RESET_DELAY_SLOTS; _mi_heap_default = heap; } return false; diff --git a/src/memory.c b/src/memory.c index 8299bbc2..f3052d6b 100644 --- a/src/memory.c +++ b/src/memory.c @@ -54,7 +54,7 @@ void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, s void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld); // local -static bool 
mi_delay_remove(mi_delay_slot_t* slots, size_t count, void* p, size_t size); +static bool mi_delay_remove(mi_delay_slots_t* delay_slots, void* p, size_t size); // Constants @@ -208,7 +208,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, Try to claim blocks in suitable regions -----------------------------------------------------------------------------*/ -static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool commit, bool allow_large ) { +static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) { // initialized at all? mi_region_info_t info = mi_atomic_read_relaxed(®ion->info); if (info==0) return false; @@ -229,7 +229,7 @@ static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, boo } -static bool mi_region_try_claim(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) +static bool mi_region_try_claim(size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { // try all regions for a free slot const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld)); @@ -238,7 +238,7 @@ static bool mi_region_try_claim(size_t blocks, bool commit, bool allow_large, me for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around mem_region_t* r = ®ions[idx]; - if (mi_region_is_suitable(r, numa_node, commit, allow_large)) { + if (mi_region_is_suitable(r, numa_node, allow_large)) { if (mi_bitmap_try_claim_field(&r->in_use, 0, blocks, bit_idx)) { tld->region_idx = idx; // remember the last found position *region = r; @@ -256,7 +256,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo mem_region_t* region; mi_bitmap_index_t bit_idx; // first try to claim in existing regions - if (!mi_region_try_claim(blocks, *commit, *is_large, ®ion, &bit_idx, tld)) { + if (!mi_region_try_claim(blocks, *is_large, ®ion, &bit_idx, tld)) { // otherwise try to allocate a fresh region if (!mi_region_try_alloc_os(blocks, *commit, *is_large, ®ion, &bit_idx, tld)) { // out of regions or memory @@ -354,7 +354,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_os_tld_t* tld) { if (p==NULL) return; if (size==0) return; - mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, p, size); + mi_delay_remove(tld->reset_delay, p, size); size_t arena_memid = 0; mi_bitmap_index_t bit_idx; @@ -424,7 +424,7 @@ void _mi_mem_collect(mi_os_tld_t* tld) { bool is_eager_committed; void* start = mi_region_info_read(mi_atomic_read(®ions[i].info), NULL, &is_eager_committed); if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { - mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, start, MI_REGION_SIZE); + mi_delay_remove(tld->reset_delay, start, MI_REGION_SIZE); _mi_arena_free(start, MI_REGION_SIZE, region->arena_memid, tld->stats); } // and release @@ -440,21 +440,22 @@ void _mi_mem_collect(mi_os_tld_t* tld) { typedef void (mi_delay_resolve_fun)(void* addr, size_t size, void* arg); -static void mi_delay_insert(mi_delay_slot_t* slots, size_t count, +static void mi_delay_insert(mi_delay_slots_t* ds, mi_msecs_t delay, uint8_t* addr, size_t size, mi_delay_resolve_fun* resolve, void* arg) { - if (delay==0) { + if (ds == NULL || delay==0 || addr==NULL || size==0) { resolve(addr, size, arg); return; } mi_msecs_t now = _mi_clock_now(); - mi_delay_slot_t* oldest = slots; + mi_delay_slot_t* oldest = &ds->slots[0]; // 
walk through all slots, resolving expired ones. // remember the oldest slot to insert the new entry in. - for (size_t i = 0; i < count; i++) { - mi_delay_slot_t* slot = &slots[i]; + size_t newcount = 0; + for (size_t i = 0; i < ds->count; i++) { + mi_delay_slot_t* slot = &ds->slots[i]; if (slot->expire == 0) { // empty slot @@ -480,26 +481,40 @@ static void mi_delay_insert(mi_delay_slot_t* slots, size_t count, } else if (oldest->expire > slot->expire) { oldest = slot; + newcount = i+1; + } + else { + newcount = i+1; } } + ds->count = newcount; if (delay>0) { - // not yet registered, use the oldest slot - if (oldest->expire > 0) { + // not yet registered, use the oldest slot (or a new one if there is space) + if (ds->count < ds->capacity) { + oldest = &ds->slots[ds->count]; + ds->count++; + } + else if (oldest->expire > 0) { resolve(oldest->addr, oldest->size, arg); // evict if not empty } + mi_assert_internal((oldest - ds->slots) < (ptrdiff_t)ds->count); oldest->expire = now + delay; oldest->addr = addr; oldest->size = size; } } -static bool mi_delay_remove(mi_delay_slot_t* slots, size_t count, void* p, size_t size) +static bool mi_delay_remove(mi_delay_slots_t* ds, void* p, size_t size) { + if (ds == NULL || p==NULL || size==0) return false; + uint8_t* addr = (uint8_t*)p; bool done = false; - // walk through all slots - for (size_t i = 0; i < count; i++) { - mi_delay_slot_t* slot = &slots[i]; + size_t newcount = 0; + + // walk through all valid slots + for (size_t i = 0; i < ds->count; i++) { + mi_delay_slot_t* slot = &ds->slots[i]; if (slot->addr <= addr && slot->addr + slot->size >= addr + size) { // earlier slot encompasses the area; remove it slot->expire = 0; @@ -510,12 +525,17 @@ static bool mi_delay_remove(mi_delay_slot_t* slots, size_t count, void* p, size_ slot->expire = 0; } else if ((addr <= slot->addr && addr + size > slot->addr) || - (addr < slot->addr + slot->size && addr + size >= slot->addr + slot->size)) { - // partial overlap, remove slot - mi_assert_internal(false); + (addr < slot->addr + slot->size && addr + size >= slot->addr + slot->size)) { + // partial overlap + // can happen with a large object spanning onto some partial end block + // mi_assert_internal(false); slot->expire = 0; } + else { + newcount = i + 1; + } } + ds->count = newcount; return done; } @@ -525,13 +545,13 @@ static void mi_resolve_reset(void* p, size_t size, void* vtld) { } bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) { - mi_delay_insert(tld->reset_delay, MI_RESET_DELAY_SLOTS, mi_option_get(mi_option_reset_delay), + mi_delay_insert(tld->reset_delay, mi_option_get(mi_option_reset_delay), (uint8_t*)p, size, &mi_resolve_reset, tld); return true; } bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { - if (!mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, (uint8_t*)p, size)) { + if (!mi_delay_remove(tld->reset_delay, (uint8_t*)p, size)) { return _mi_os_unreset(p, size, is_zero, tld->stats); } return true; @@ -544,12 +564,12 @@ bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { -----------------------------------------------------------------------------*/ bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { - mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, p, size); + mi_delay_remove(tld->reset_delay,p, size); return _mi_os_commit(p, size, is_zero, tld->stats); } bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) { - mi_delay_remove(tld->reset_delay, MI_RESET_DELAY_SLOTS, p, size); + 
mi_delay_remove(tld->reset_delay, p, size); return _mi_os_decommit(p, size, tld->stats); } diff --git a/src/options.c b/src/options.c index 81ffe88b..ff96c95b 100644 --- a/src/options.c +++ b/src/options.c @@ -65,7 +65,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread - { 0, UNINIT, MI_OPTION(page_reset) }, // reset pages on free + { 1, UNINIT, MI_OPTION(page_reset) }, // reset pages on free { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 500, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds diff --git a/src/segment.c b/src/segment.c index 49dab6ba..549dd339 100644 --- a/src/segment.c +++ b/src/segment.c @@ -504,7 +504,9 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_seg _mi_stat_decrease(&tld->stats->pages, 1); // reset the page memory to reduce memory pressure? - if (!segment->mem_is_fixed && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) { + if (!segment->mem_is_fixed && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) + // && segment->page_kind <= MI_PAGE_MEDIUM) // to prevent partial overlapping resets + { size_t psize; uint8_t* start = _mi_page_start(segment, page, &psize); page->is_reset = true; From 165ee4584597aebdb1a45fcd4e8b3904b6f7d396 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 11 Nov 2019 17:31:48 -0800 Subject: [PATCH 39/48] initialize delay slots for the main thread --- src/init.c | 4 +++- src/options.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/init.c b/src/init.c index c9700cd5..5967b4b9 100644 --- a/src/init.c +++ b/src/init.c @@ -96,11 +96,13 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; #define tld_main_stats ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats))) #define tld_main_os ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os))) +static mi_delay_slots_t tld_reset_delay_main = { MI_RESET_DELAY_SLOTS, 0, { {0,NULL,0} } }; + static mi_tld_t tld_main = { 0, false, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os }, // segments - { 0, NULL, tld_main_stats }, // os + { 0, &tld_reset_delay_main, tld_main_stats }, // os { MI_STATS_NULL } // stats }; diff --git a/src/options.c b/src/options.c index ff96c95b..81ffe88b 100644 --- a/src/options.c +++ b/src/options.c @@ -65,7 +65,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread - { 1, UNINIT, MI_OPTION(page_reset) }, // reset pages on free + { 0, UNINIT, MI_OPTION(page_reset) }, // reset pages on free { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 500, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds From ef179a63770d8e17f105303a08ddfdd57085b936 Mon Sep 17 00:00:00 2001 From: daan Date: Tue, 12 Nov 
2019 10:16:59 -0800 Subject: [PATCH 40/48] avoid allocation at numa node detection on linux --- include/mimalloc-internal.h | 37 +++++++++++++++------ src/os.c | 65 +++++++++++++++++-------------------- 2 files changed, 56 insertions(+), 46 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 6bfabe27..668a7bd3 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -17,18 +17,18 @@ terms of the MIT license. A copy of the license can be found in the file #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) #else -#define mi_trace_message(...) +#define mi_trace_message(...) #endif #if defined(_MSC_VER) #define mi_decl_noinline __declspec(noinline) -#define mi_attr_noreturn +#define mi_attr_noreturn #elif defined(__GNUC__) || defined(__clang__) #define mi_decl_noinline __attribute__((noinline)) #define mi_attr_noreturn __attribute__((noreturn)) #else #define mi_decl_noinline -#define mi_attr_noreturn +#define mi_attr_noreturn #endif @@ -56,8 +56,6 @@ void _mi_os_init(void); // called fro void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocate thread local data void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data size_t _mi_os_good_alloc_size(size_t size); -int _mi_os_numa_node(mi_os_tld_t* tld); -int _mi_os_numa_node_count(void); // memory.c void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld); @@ -146,8 +144,8 @@ bool _mi_page_is_valid(mi_page_t* page); Inlined definitions ----------------------------------------------------------- */ #define UNUSED(x) (void)(x) -#if (MI_DEBUG>0) -#define UNUSED_RELEASE(x) +#if (MI_DEBUG>0) +#define UNUSED_RELEASE(x) #else #define UNUSED_RELEASE(x) UNUSED(x) #endif @@ -398,7 +396,7 @@ static inline mi_block_t* mi_block_nextx( uintptr_t cookie, const mi_block_t* bl #endif } -static inline void mi_block_set_nextx(uintptr_t cookie, mi_block_t* block, const mi_block_t* next) { +static inline void mi_block_set_nextx(uintptr_t cookie, mi_block_t* block, const mi_block_t* next) { #ifdef MI_ENCODE_FREELIST block->next = (mi_encoded_t)next ^ cookie; #else @@ -411,12 +409,12 @@ static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* #ifdef MI_ENCODE_FREELIST mi_block_t* next = mi_block_nextx(page->cookie,block); // check for free list corruption: is `next` at least in our segment range? - // TODO: it is better to check if it is actually inside our page but that is more expensive + // TODO: it is better to check if it is actually inside our page but that is more expensive // to calculate. Perhaps with a relative free list this becomes feasible? 
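The range check below guards the encoded free list: the next pointer is stored XOR-ed with a secret cookie (see mi_block_set_nextx), so an overflow that overwrites it with a plausible pointer decodes to an out-of-range value that the segment check can reject. A stripped-down sketch with illustrative demo_* names:

#include <stdint.h>

typedef struct demo_block_s { uintptr_t next; } demo_block_t;

static void demo_block_set_next(uintptr_t cookie, demo_block_t* block, const demo_block_t* next) {
  block->next = (uintptr_t)next ^ cookie;        // encode on store
}

static demo_block_t* demo_block_next(uintptr_t cookie, const demo_block_t* block) {
  return (demo_block_t*)(block->next ^ cookie);  // decode on load; corruption decodes to garbage
}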
if (next!=NULL && !mi_is_in_same_segment(block, next)) { _mi_fatal_error("corrupted free list entry of size %zub at %p: value 0x%zx\n", page->block_size, block, (uintptr_t)next); next = NULL; - } + } return next; #else UNUSED(page); @@ -433,6 +431,25 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c #endif } + +// ------------------------------------------------------------------- +// Optimize numa node access for the common case (= one node) +// ------------------------------------------------------------------- + +int _mi_os_numa_node_get(mi_os_tld_t* tld); +int _mi_os_numa_node_count_get(void); + +extern int _mi_numa_node_count; +static inline int _mi_os_numa_node(mi_os_tld_t* tld) { + if (mi_likely(_mi_numa_node_count == 1)) return 0; + else return _mi_os_numa_node_get(tld); +} +static inline int _mi_os_numa_node_count(void) { + if (mi_likely(_mi_numa_node_count>0)) return _mi_numa_node_count; + else return _mi_os_numa_node_count_get(); +} + + // ------------------------------------------------------------------- // Getting the thread id should be performant // as it is called in the fast path of `_mi_free`, diff --git a/src/os.c b/src/os.c index 5229381b..d6878927 100644 --- a/src/os.c +++ b/src/os.c @@ -786,9 +786,9 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; mi_win_enable_large_os_pages(); - + #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) - MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} }; + MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} }; // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages static bool mi_huge_pages_available = true; if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { @@ -818,7 +818,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation if (pVirtualAlloc2 != NULL && numa_node >= 0) { params[0].Type = MemExtendedParameterNumaNode; - params[0].ULong = (unsigned)numa_node; + params[0].ULong = (unsigned)numa_node; return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); } #endif @@ -838,7 +838,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) #ifdef MI_HAS_NUMA if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes uintptr_t numa_mask = (1UL << numa_node); - // TODO: does `mbind` work correctly for huge OS pages? should we + // TODO: does `mbind` work correctly for huge OS pages? should we // use `set_mempolicy` before calling mmap instead? // see: long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); @@ -857,7 +857,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) } #endif -#if (MI_INTPTR_SIZE >= 8) +#if (MI_INTPTR_SIZE >= 8) // To ensure proper alignment, use our own area for huge OS pages static _Atomic(uintptr_t) mi_huge_start; // = 0 @@ -900,7 +900,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse size_t size = 0; uint8_t* start = mi_os_claim_huge_pages(pages, &size); if (start == NULL) return NULL; // or 32-bit systems - + // Allocate one page at the time but try to place them contiguously // We allocate one page at the time to be able to abort if it takes too long // or to at least allocate as many as available on the system. 
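The loop continuing below reserves huge pages one at a time against a time budget, so the process is not blocked for long and a partial reservation still counts. A minimal POSIX sketch of that pattern, with plain mmap standing in for real 1GiB huge-page allocation and the contiguous-placement hint omitted (demo_* names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>
#include <time.h>

typedef int64_t demo_msecs_t;

static demo_msecs_t demo_now(void) {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return (demo_msecs_t)ts.tv_sec*1000 + ts.tv_nsec/1000000;
}

static size_t demo_reserve_pages(size_t pages, size_t page_size, demo_msecs_t max_msecs) {
  const demo_msecs_t start = demo_now();
  size_t reserved = 0;
  for (size_t i = 0; i < pages; i++) {
    void* p = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) break;                                    // no more pages available
    reserved++;
    if (max_msecs > 0 && (demo_now() - start) > max_msecs) break;  // time budget spent
  }
  return reserved;   // the caller can work with a partial reservation
}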
@@ -920,11 +920,11 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse } break; } - + // success, record it _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); - + // check for timeout if (max_msecs > 0) { mi_msecs_t elapsed = _mi_clock_end(start_t); @@ -958,7 +958,7 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) { } /* ---------------------------------------------------------------------------- -Support NUMA aware allocation +Support NUMA aware allocation -----------------------------------------------------------------------------*/ #ifdef WIN32 static int mi_os_numa_nodex() { @@ -975,9 +975,8 @@ static int mi_os_numa_node_countx(void) { return (int)(numa_max + 1); } #elif defined(__linux__) -#include -#include -#include +#include // getcpu +#include // access static int mi_os_numa_nodex(void) { #ifdef SYS_getcpu @@ -990,22 +989,15 @@ static int mi_os_numa_nodex(void) { return 0; #endif } - static int mi_os_numa_node_countx(void) { - DIR* d = opendir("/sys/devices/system/node"); - if (d==NULL) return 1; - - struct dirent* de; - int max_node_num = 0; - while ((de = readdir(d)) != NULL) { - int node_num; - if (strncmp(de->d_name, "node", 4) == 0) { - node_num = (int)strtol(de->d_name+4, NULL, 0); - if (max_node_num < node_num) max_node_num = node_num; - } + char buf[128]; + int max_node = mi_option_get(mi_option_max_numa_node); + int node = 0; + for(node = 0; node < max_node; node++) { + snprintf(buf, 127, "/sys/devices/system/node/node%i", node + 1); + if (access(buf,R_OK) != 0) break; } - closedir(d); - return (max_node_num + 1); + return (node+1); } #else static int mi_os_numa_nodex(void) { @@ -1016,29 +1008,30 @@ static int mi_os_numa_node_countx(void) { } #endif -int _mi_os_numa_node_count(void) { - static int numa_node_count = 0; // cache the node count - if (mi_unlikely(numa_node_count <= 0)) { - int ncount = mi_os_numa_node_countx(); +int _mi_numa_node_count = 0; // cache the node count + +int _mi_os_numa_node_count_get(void) { + if (mi_unlikely(_mi_numa_node_count <= 0)) { + int ncount = mi_os_numa_node_countx(); int ncount0 = ncount; // never more than max numa node and at least 1 int nmax = 1 + (int)mi_option_get(mi_option_max_numa_node); if (ncount > nmax) ncount = nmax; if (ncount <= 0) ncount = 1; - numa_node_count = ncount; - _mi_verbose_message("using %i numa regions (%i nodes detected)\n", numa_node_count, ncount0); + _mi_numa_node_count = ncount; + _mi_verbose_message("using %i numa regions (%i nodes detected)\n", _mi_numa_node_count, ncount0); } - mi_assert_internal(numa_node_count >= 1); - return numa_node_count; + mi_assert_internal(_mi_numa_node_count >= 1); + return _mi_numa_node_count; } -int _mi_os_numa_node(mi_os_tld_t* tld) { +int _mi_os_numa_node_get(mi_os_tld_t* tld) { UNUSED(tld); int numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 int numa_node = mi_os_numa_nodex(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } - if (numa_node < 0) numa_node = 0; + if (numa_node < 0) numa_node = 0; return numa_node; } From af746ca4c1682e29dd42e8c0e6fa6db6aa04b200 Mon Sep 17 00:00:00 2001 From: daan Date: Tue, 12 Nov 2019 10:17:39 -0800 Subject: [PATCH 41/48] inline bitmap_mask --- src/bitmap.inc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/bitmap.inc.c 
b/src/bitmap.inc.c index 3847e712..81f87a79 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -8,11 +8,11 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- This file is meant to be included in other files for efficiency. It implements a bitmap that can set/reset sequences of bits atomically -and is used to concurrently claim memory ranges. +and is used to concurrently claim memory ranges. A bitmap is an array of fields where each field is a machine word (`uintptr_t`) -A current limitation is that the bit sequences cannot cross fields +A current limitation is that the bit sequences cannot cross fields and that the sequence must be smaller or equal to the bits in a field. ---------------------------------------------------------------------------- */ #pragma once @@ -59,7 +59,7 @@ static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { // The bit mask for a given number of blocks at a specified bit index. -static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { +static inline uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); if (count == MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; return ((((uintptr_t)1 << count) - 1) << bitidx); @@ -104,10 +104,10 @@ static inline size_t mi_bsr(uintptr_t x) { Claim a bit sequence atomically ----------------------------------------------------------- */ -// Try to atomically claim a sequence of `count` bits in a single +// Try to atomically claim a sequence of `count` bits in a single // field at `idx` in `bitmap`. Returns `true` on success. -static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) -{ +static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) +{ mi_assert_internal(bitmap_idx != NULL); volatile _Atomic(uintptr_t)* field = &bitmap[idx]; uintptr_t map = mi_atomic_read(field); @@ -136,7 +136,7 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con continue; } else { - // success, we claimed the bits! + // success, we claimed the bits! 
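mi_bitmap_try_claim_field above builds a mask of `count` bits and claims it with a single compare-and-swap, retrying inside the field when another thread wins the race. A simplified C11-atomics sketch of the same idea (illustrative demo_* name; a linear scan replaces the bit-scan skipping of the real code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static bool demo_try_claim(_Atomic(size_t)* field, size_t count, size_t* bitidx) {
  const size_t bits = sizeof(size_t)*8;
  const size_t mask0 = (count >= bits ? (size_t)-1 : (((size_t)1 << count) - 1));
  size_t map = atomic_load_explicit(field, memory_order_relaxed);
  size_t i = 0;
  while (i + count <= bits) {
    const size_t mask = mask0 << i;
    if ((map & mask) != 0) { i++; continue; }       // bits in use, try the next offset
    size_t expected = map;
    if (atomic_compare_exchange_weak(field, &expected, map | mask)) {
      *bitidx = i;                                  // success: the bits are ours
      return true;
    }
    map = expected;                                 // lost a race: re-read and rescan
    i = 0;
  }
  return false;                                     // no room in this field
}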
*bitmap_idx = mi_bitmap_index_create(idx, bitidx); return true; } @@ -205,4 +205,4 @@ static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields return ((mi_atomic_read(&bitmap[idx]) & mask) == mask); } -#endif \ No newline at end of file +#endif From 867d78f877474c7f36fd19bc2ea62918f117f068 Mon Sep 17 00:00:00 2001 From: daan Date: Tue, 12 Nov 2019 10:19:52 -0800 Subject: [PATCH 42/48] reserve huge OS pages earlier on at process_init --- src/init.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/init.c b/src/init.c index 5967b4b9..473e9a32 100644 --- a/src/init.c +++ b/src/init.c @@ -19,7 +19,7 @@ const mi_page_t _mi_page_empty = { 0, #endif 0, // used - NULL, + NULL, ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(0), 0, NULL, NULL, NULL #if (MI_INTPTR_SIZE==8 && defined(MI_ENCODE_FREELIST)) || (MI_INTPTR_SIZE==4 && !defined(MI_ENCODE_FREELIST)) @@ -246,7 +246,7 @@ static bool _mi_heap_done(void) { // switch to backing heap and free it heap = heap->tld->heap_backing; if (!mi_heap_is_initialized(heap)) return false; - + // collect if not the main thread if (heap != &_mi_heap_main) { _mi_heap_collect_abandon(heap); @@ -394,7 +394,7 @@ bool mi_is_redirected() mi_attr_noexcept { } // Communicate with the redirection module on Windows -#if defined(_WIN32) && defined(MI_SHARED_LIB) +#if defined(_WIN32) && defined(MI_SHARED_LIB) #ifdef __cplusplus extern "C" { #endif @@ -440,11 +440,6 @@ static void mi_process_load(void) { if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) { _mi_fputs(NULL,NULL,msg); } - - if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { - size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); - mi_reserve_huge_os_pages_interleave(pages, pages*500); - } } // Initialize the process; called by thread_init or the process loader @@ -471,6 +466,11 @@ void mi_process_init(void) mi_attr_noexcept { #endif mi_thread_init(); mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) + + if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { + size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); + mi_reserve_huge_os_pages_interleave(pages, pages*500); + } } // Called when the process is done (through `at_exit`) @@ -497,7 +497,7 @@ static void mi_process_done(void) { #if defined(_WIN32) && defined(MI_SHARED_LIB) - // Windows DLL: easy to hook into process_init and thread_done + // Windows DLL: easy to hook into process_init and thread_done __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { UNUSED(reserved); UNUSED(inst); From d4f54dcf3049bd958ee262cbd9b3b0c7134d59ed Mon Sep 17 00:00:00 2001 From: daan Date: Tue, 12 Nov 2019 10:37:15 -0800 Subject: [PATCH 43/48] remove numaif dependency on linux --- CMakeLists.txt | 11 ----------- src/os.c | 21 ++++++++++++++------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18bdea5a..a2258128 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,6 @@ option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanis option(MI_BUILD_TESTS "Build test executables" ON) include("cmake/mimalloc-config-version.cmake") -include("CheckIncludeFile") set(mi_install_dir "lib/mimalloc-${mi_version}") @@ -98,16 +97,6 @@ if(MI_USE_CXX MATCHES "ON") set_source_files_properties(src/static.c test/test-api.c PROPERTIES LANGUAGE CXX ) endif() -CHECK_INCLUDE_FILE("numaif.h" 
MI_HAVE_NUMA_H) -if(MI_HAVE_NUMA_H) - list(APPEND mi_defines MI_HAS_NUMA) - list(APPEND mi_libraries numa) -else() - if (NOT(WIN32)) - message(WARNING "Compiling without using NUMA optimized allocation (on Linux, install libnuma-dev?)") - endif() -endif() - # Compiler flags if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU") list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas) diff --git a/src/os.c b/src/os.c index d6878927..7af7363b 100644 --- a/src/os.c +++ b/src/os.c @@ -827,28 +827,35 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) } #elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) -#ifdef MI_HAS_NUMA -#include // mbind, and use -lnuma +#include +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif +#if defined(SYS_mbind) +static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { + return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags); +} +#else +static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { + UNUSED(start); UNUSED(len); UNUSED(mode); UNUSED(nmask); UNUSED(maxnode); UNUSED(flags); + return 0; +} #endif static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { mi_assert_internal(size%GiB == 0); bool is_large = true; void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); if (p == NULL) return NULL; - #ifdef MI_HAS_NUMA if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes uintptr_t numa_mask = (1UL << numa_node); // TODO: does `mbind` work correctly for huge OS pages? should we // use `set_mempolicy` before calling mmap instead? 
// see: - long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); + long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); if (err != 0) { _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno)); } } - #else - UNUSED(numa_node); - #endif return p; } #else From bdb82748191ac5dbc436f0f62dcbebfd3df95157 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 12 Nov 2019 12:04:43 -0800 Subject: [PATCH 44/48] change max_numa_node to max_numa_nodes option --- include/mimalloc.h | 2 +- src/options.c | 2 +- src/os.c | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 67b17c73..8d029135 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -275,7 +275,7 @@ typedef enum mi_option_e { mi_option_eager_commit_delay, mi_option_reset_delay, mi_option_os_tag, - mi_option_max_numa_node, + mi_option_max_numa_nodes, mi_option_max_errors, _mi_option_last } mi_option_t; diff --git a/src/options.c b/src/options.c index 81ffe88b..bbea4e67 100644 --- a/src/options.c +++ b/src/options.c @@ -70,7 +70,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 500, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose - { 256, UNINIT, MI_OPTION(max_numa_node) }, // maximum allowed numa node + { 256, UNINIT, MI_OPTION(max_numa_nodes) }, // use at most N numa nodes { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output }; diff --git a/src/os.c b/src/os.c index 7af7363b..93fb8b31 100644 --- a/src/os.c +++ b/src/os.c @@ -998,9 +998,10 @@ static int mi_os_numa_nodex(void) { } static int mi_os_numa_node_countx(void) { char buf[128]; - int max_node = mi_option_get(mi_option_max_numa_node); + int max_nodes = mi_option_get(mi_option_max_numa_nodes); // set to 0 to disable detection (and NUMA awareness) int node = 0; - for(node = 0; node < max_node; node++) { + for(node = 0; node < max_nodes; node++) { + // enumerate node entries -- todo: it there a more efficient way to do this? 
(but ensure there is no allocation) snprintf(buf, 127, "/sys/devices/system/node/node%i", node + 1); if (access(buf,R_OK) != 0) break; } @@ -1022,7 +1023,7 @@ int _mi_os_numa_node_count_get(void) { int ncount = mi_os_numa_node_countx(); int ncount0 = ncount; // never more than max numa node and at least 1 - int nmax = 1 + (int)mi_option_get(mi_option_max_numa_node); + int nmax = (int)mi_option_get(mi_option_max_numa_nodes); if (ncount > nmax) ncount = nmax; if (ncount <= 0) ncount = 1; _mi_numa_node_count = ncount; From d01ed42bcb755ed6c1b52bfd8a306821da098dd5 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Wed, 13 Nov 2019 13:35:50 -0800 Subject: [PATCH 45/48] replace max_numa_nodes by use_numa_nodes (to help with wrong detection of numa nodes on WSL for example) --- include/mimalloc-internal.h | 8 +++--- include/mimalloc.h | 4 +-- src/arena.c | 15 +++++------ src/init.c | 2 +- src/options.c | 4 +-- src/os.c | 54 +++++++++++++++++-------------------- 6 files changed, 40 insertions(+), 47 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 668a7bd3..77045a99 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -436,15 +436,15 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c // Optimize numa node access for the common case (= one node) // ------------------------------------------------------------------- -int _mi_os_numa_node_get(mi_os_tld_t* tld); -int _mi_os_numa_node_count_get(void); +int _mi_os_numa_node_get(mi_os_tld_t* tld); +size_t _mi_os_numa_node_count_get(void); -extern int _mi_numa_node_count; +extern size_t _mi_numa_node_count; static inline int _mi_os_numa_node(mi_os_tld_t* tld) { if (mi_likely(_mi_numa_node_count == 1)) return 0; else return _mi_os_numa_node_get(tld); } -static inline int _mi_os_numa_node_count(void) { +static inline size_t _mi_os_numa_node_count(void) { if (mi_likely(_mi_numa_node_count>0)) return _mi_numa_node_count; else return _mi_os_numa_node_count_get(); } diff --git a/include/mimalloc.h b/include/mimalloc.h index 8d029135..3c942849 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -230,7 +230,7 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; mi_decl_export bool mi_is_redirected() mi_attr_noexcept; -mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t timeout_msecs) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept; mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; // deprecated @@ -274,8 +274,8 @@ typedef enum mi_option_e { mi_option_segment_reset, mi_option_eager_commit_delay, mi_option_reset_delay, + mi_option_use_numa_nodes, mi_option_os_tag, - mi_option_max_numa_nodes, mi_option_max_errors, _mi_option_last } mi_option_t; diff --git a/src/arena.c b/src/arena.c index 02890bd6..46741208 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,7 +42,6 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_sec void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats); bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -int _mi_os_numa_node_count(void); /* ----------------------------------------------------------- Arena allocation @@ -317,22 +316,22 @@ int mi_reserve_huge_os_pages_at(size_t pages, int 
numa_node, size_t timeout_msec } -// reserve huge pages evenly among all numa nodes. -int mi_reserve_huge_os_pages_interleave(size_t pages, size_t timeout_msecs) mi_attr_noexcept { +// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { if (pages == 0) return 0; // pages per numa node - int numa_count = _mi_os_numa_node_count(); + size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); if (numa_count <= 0) numa_count = 1; const size_t pages_per = pages / numa_count; const size_t pages_mod = pages % numa_count; const size_t timeout_per = (timeout_msecs / numa_count) + 50; // reserve evenly among numa nodes - for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { size_t node_pages = pages_per; // can be 0 - if ((size_t)numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per); + if (numa_node < pages_mod) node_pages++; + int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); if (err) return err; if (pages < node_pages) { pages = 0; @@ -349,7 +348,7 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv UNUSED(max_secs); _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); if (pages_reserved != NULL) *pages_reserved = 0; - int err = mi_reserve_huge_os_pages_interleave(pages, (size_t)(max_secs * 1000.0)); + int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; return err; } diff --git a/src/init.c b/src/init.c index 473e9a32..72543b95 100644 --- a/src/init.c +++ b/src/init.c @@ -469,7 +469,7 @@ void mi_process_init(void) mi_attr_noexcept { if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); - mi_reserve_huge_os_pages_interleave(pages, pages*500); + mi_reserve_huge_os_pages_interleave(pages, 0, pages*500); } } diff --git a/src/options.c b/src/options.c index bbea4e67..180f6a75 100644 --- a/src/options.c +++ b/src/options.c @@ -69,9 +69,9 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed { 500, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds + { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose - { 256, UNINIT, MI_OPTION(max_numa_nodes) }, // use at most N numa nodes - { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output + { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/os.c b/src/os.c index 93fb8b31..2415a40d 100644 --- a/src/os.c +++ b/src/os.c @@ -968,66 +968,61 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) { Support NUMA aware allocation -----------------------------------------------------------------------------*/ #ifdef WIN32 -static int mi_os_numa_nodex() { +static size_t mi_os_numa_nodex() { PROCESSOR_NUMBER pnum; USHORT numa_node = 0; GetCurrentProcessorNumberEx(&pnum); GetNumaProcessorNodeEx(&pnum,&numa_node); - return (int)numa_node; + return numa_node; } -static int mi_os_numa_node_countx(void) { +static size_t mi_os_numa_node_countx(void) { ULONG numa_max = 0; GetNumaHighestNodeNumber(&numa_max); - return (int)(numa_max + 1); + return (numa_max + 1); } #elif defined(__linux__) #include // getcpu #include // access -static int mi_os_numa_nodex(void) { +static size_t mi_os_numa_nodex(void) { #ifdef SYS_getcpu - unsigned node = 0; - unsigned ncpu = 0; - int err = syscall(SYS_getcpu, &ncpu, &node, NULL); + unsigned long node = 0; + unsigned long ncpu = 0; + long err = syscall(SYS_getcpu, &ncpu, &node, NULL); if (err != 0) return 0; - return (int)node; + return node; #else return 0; #endif } -static int mi_os_numa_node_countx(void) { +static size_t mi_os_numa_node_countx(void) { char buf[128]; - int max_nodes = mi_option_get(mi_option_max_numa_nodes); // set to 0 to disable detection (and NUMA awareness) - int node = 0; - for(node = 0; node < max_nodes; node++) { + unsigned node = 0; + for(node = 0; node < 256; node++) { // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation) - snprintf(buf, 127, "/sys/devices/system/node/node%i", node + 1); + snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); if (access(buf,R_OK) != 0) break; } return (node+1); } #else -static int mi_os_numa_nodex(void) { +static size_t mi_os_numa_nodex(void) { return 0; } -static int mi_os_numa_node_countx(void) { +static size_t mi_os_numa_node_countx(void) { return 1; } #endif -int _mi_numa_node_count = 0; // cache the node count +size_t _mi_numa_node_count = 0; // cache the node count -int _mi_os_numa_node_count_get(void) { +size_t _mi_os_numa_node_count_get(void) { if (mi_unlikely(_mi_numa_node_count <= 0)) { - int ncount = mi_os_numa_node_countx(); - int ncount0 = ncount; - // never more than max numa node and at least 1 - int nmax = (int)mi_option_get(mi_option_max_numa_nodes); - if (ncount > nmax) ncount = nmax; - if (ncount <= 0) ncount = 1; - _mi_numa_node_count = ncount; - _mi_verbose_message("using %i numa regions (%i nodes detected)\n", _mi_numa_node_count, ncount0); + long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly? + if (ncount <= 0) ncount = (long)mi_os_numa_node_countx(); // or detect dynamically + _mi_numa_node_count = (size_t)(ncount <= 0 ? 
1 : ncount); + _mi_verbose_message("using %zd numa regions\n", _mi_numa_node_count); } mi_assert_internal(_mi_numa_node_count >= 1); return _mi_numa_node_count; @@ -1035,11 +1030,10 @@ int _mi_os_numa_node_count_get(void) { int _mi_os_numa_node_get(mi_os_tld_t* tld) { UNUSED(tld); - int numa_count = _mi_os_numa_node_count(); + size_t numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 - int numa_node = mi_os_numa_nodex(); + size_t numa_node = mi_os_numa_nodex(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } - if (numa_node < 0) numa_node = 0; - return numa_node; + return (int)numa_node; } From 30e2c54adba9f1d2ef32e35e4e6c4b80e5732c26 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Wed, 20 Nov 2019 14:13:02 -0800 Subject: [PATCH 46/48] remove delayed reset option (for now) --- include/mimalloc.h | 2 +- src/memory.c | 139 ++------------------------ src/options.c | 4 +- src/os.c | 237 +++++++++++++++++++++++++-------------------- 4 files changed, 142 insertions(+), 240 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 3c942849..a59b9cf7 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -273,7 +273,7 @@ typedef enum mi_option_e { mi_option_page_reset, mi_option_segment_reset, mi_option_eager_commit_delay, - mi_option_reset_delay, + mi_option_reset_decommits, mi_option_use_numa_nodes, mi_option_os_tag, mi_option_max_errors, diff --git a/src/memory.c b/src/memory.c index f3052d6b..b0bcf7a0 100644 --- a/src/memory.c +++ b/src/memory.c @@ -53,9 +53,6 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats); void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld); void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld); -// local -static bool mi_delay_remove(mi_delay_slots_t* delay_slots, void* p, size_t size); - // Constants #if (MI_INTPTR_SIZE==8) @@ -354,8 +351,6 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_os_tld_t* tld) { if (p==NULL) return; if (size==0) return; - mi_delay_remove(tld->reset_delay, p, size); - size_t arena_memid = 0; mi_bitmap_index_t bit_idx; mem_region_t* region; @@ -424,7 +419,6 @@ void _mi_mem_collect(mi_os_tld_t* tld) { bool is_eager_committed; void* start = mi_region_info_read(mi_atomic_read(®ions[i].info), NULL, &is_eager_committed); if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { - mi_delay_remove(tld->reset_delay, start, MI_REGION_SIZE); _mi_arena_free(start, MI_REGION_SIZE, region->arena_memid, tld->stats); } // and release @@ -434,142 +428,23 @@ void _mi_mem_collect(mi_os_tld_t* tld) { } } -/* ---------------------------------------------------------------------------- - Delay slots ------------------------------------------------------------------------------*/ - -typedef void (mi_delay_resolve_fun)(void* addr, size_t size, void* arg); - -static void mi_delay_insert(mi_delay_slots_t* ds, - mi_msecs_t delay, uint8_t* addr, size_t size, - mi_delay_resolve_fun* resolve, void* arg) -{ - if (ds == NULL || delay==0 || addr==NULL || size==0) { - resolve(addr, size, arg); - return; - } - - mi_msecs_t now = _mi_clock_now(); - mi_delay_slot_t* oldest = &ds->slots[0]; - // walk through all slots, resolving expired ones. - // remember the oldest slot to insert the new entry in. 
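The delayed-reset bookkeeping removed in this patch amounts to postponing an expensive OS reset: an address range is recorded with an expiry time, the reset runs only once the delay has passed, and it is cancelled if the range is reused first. A single-slot, single-threaded sketch with illustrative demo_* names:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef int64_t demo_msecs_t;

typedef struct demo_delay_slot_s {
  demo_msecs_t expire;   // 0 means the slot is empty
  uint8_t* addr;
  size_t size;
} demo_delay_slot_t;

typedef void (demo_resolve_fun)(void* addr, size_t size);

// schedule a reset instead of performing it right away
static void demo_delay_reset(demo_delay_slot_t* slot, demo_msecs_t now, demo_msecs_t delay,
                             uint8_t* addr, size_t size, demo_resolve_fun* resolve) {
  if (slot->expire != 0) resolve(slot->addr, slot->size);  // evict any pending entry first
  slot->expire = now + delay;
  slot->addr = addr;
  slot->size = size;
}

// resolve the pending reset once its delay has passed (call periodically)
static void demo_delay_flush(demo_delay_slot_t* slot, demo_msecs_t now, demo_resolve_fun* resolve) {
  if (slot->expire != 0 && slot->expire <= now) {
    resolve(slot->addr, slot->size);
    slot->expire = 0;
  }
}

// cancel the pending reset when the range is about to be reused
static bool demo_delay_cancel(demo_delay_slot_t* slot, uint8_t* addr, size_t size) {
  if (slot->expire != 0 && slot->addr <= addr && addr + size <= slot->addr + slot->size) {
    slot->expire = 0;        // the memory is live again; do not reset it later
    return true;
  }
  return false;
}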
- size_t newcount = 0; - for (size_t i = 0; i < ds->count; i++) { - mi_delay_slot_t* slot = &ds->slots[i]; - - if (slot->expire == 0) { - // empty slot - oldest = slot; - } - // TODO: should we handle overlapping areas too? - else if (slot->addr <= addr && slot->addr + slot->size >= addr + size) { - // earlier slot encompasses new area, increase expiration - slot->expire = now + delay; - delay = 0; - } - else if (addr <= slot->addr && addr + size >= slot->addr + slot->size) { - // new one encompasses old slot, overwrite - slot->expire = now + delay; - slot->addr = addr; - slot->size = size; - delay = 0; - } - else if (slot->expire < now) { - // expired slot, resolve now - slot->expire = 0; - resolve(slot->addr, slot->size, arg); - } - else if (oldest->expire > slot->expire) { - oldest = slot; - newcount = i+1; - } - else { - newcount = i+1; - } - } - ds->count = newcount; - if (delay>0) { - // not yet registered, use the oldest slot (or a new one if there is space) - if (ds->count < ds->capacity) { - oldest = &ds->slots[ds->count]; - ds->count++; - } - else if (oldest->expire > 0) { - resolve(oldest->addr, oldest->size, arg); // evict if not empty - } - mi_assert_internal((oldest - ds->slots) < (ptrdiff_t)ds->count); - oldest->expire = now + delay; - oldest->addr = addr; - oldest->size = size; - } -} - -static bool mi_delay_remove(mi_delay_slots_t* ds, void* p, size_t size) -{ - if (ds == NULL || p==NULL || size==0) return false; - - uint8_t* addr = (uint8_t*)p; - bool done = false; - size_t newcount = 0; - - // walk through all valid slots - for (size_t i = 0; i < ds->count; i++) { - mi_delay_slot_t* slot = &ds->slots[i]; - if (slot->addr <= addr && slot->addr + slot->size >= addr + size) { - // earlier slot encompasses the area; remove it - slot->expire = 0; - done = true; - } - else if (addr <= slot->addr && addr + size >= slot->addr + slot->size) { - // new one encompasses old slot, remove it - slot->expire = 0; - } - else if ((addr <= slot->addr && addr + size > slot->addr) || - (addr < slot->addr + slot->size && addr + size >= slot->addr + slot->size)) { - // partial overlap - // can happen with a large object spanning onto some partial end block - // mi_assert_internal(false); - slot->expire = 0; - } - else { - newcount = i + 1; - } - } - ds->count = newcount; - return done; -} - -static void mi_resolve_reset(void* p, size_t size, void* vtld) { - mi_os_tld_t* tld = (mi_os_tld_t*)vtld; - _mi_os_reset(p, size, tld->stats); -} - -bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) { - mi_delay_insert(tld->reset_delay, mi_option_get(mi_option_reset_delay), - (uint8_t*)p, size, &mi_resolve_reset, tld); - return true; -} - -bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { - if (!mi_delay_remove(tld->reset_delay, (uint8_t*)p, size)) { - return _mi_os_unreset(p, size, is_zero, tld->stats); - } - return true; -} - - /* ---------------------------------------------------------------------------- Other -----------------------------------------------------------------------------*/ +bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) { + return _mi_os_reset(p, size, tld->stats); +} + +bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { + return _mi_os_unreset(p, size, is_zero, tld->stats); +} bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { - mi_delay_remove(tld->reset_delay,p, size); return _mi_os_commit(p, size, is_zero, tld->stats); } bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) 
{ - mi_delay_remove(tld->reset_delay, p, size); return _mi_os_decommit(p, size, tld->stats); } diff --git a/src/options.c b/src/options.c index 180f6a75..8c4c1707 100644 --- a/src/options.c +++ b/src/options.c @@ -65,10 +65,10 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread - { 0, UNINIT, MI_OPTION(page_reset) }, // reset pages on free + { 1, UNINIT, MI_OPTION(page_reset) }, // reset pages on free { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed - { 500, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds + { 1, UNINIT, MI_OPTION(reset_decommits) }, // reset uses decommit/commit { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output diff --git a/src/os.c b/src/os.c index 2415a40d..02683a02 100644 --- a/src/os.c +++ b/src/os.c @@ -77,11 +77,11 @@ static bool use_large_os_page(size_t size, size_t alignment) { // round to a good OS allocation size (bounded by max 12.5% waste) size_t _mi_os_good_alloc_size(size_t size) { size_t align_size; - if (size < 512*KiB) align_size = _mi_os_page_size(); - else if (size < 2*MiB) align_size = 64*KiB; - else if (size < 8*MiB) align_size = 256*KiB; - else if (size < 32*MiB) align_size = 1*MiB; - else align_size = 4*MiB; + if (size < 512 * KiB) align_size = _mi_os_page_size(); + else if (size < 2 * MiB) align_size = 64 * KiB; + else if (size < 8 * MiB) align_size = 256 * KiB; + else if (size < 32 * MiB) align_size = 1 * MiB; + else align_size = 4 * MiB; if (size >= (SIZE_MAX - align_size)) return size; // possible overflow? return _mi_align_up(size, align_size); } @@ -92,8 +92,8 @@ size_t _mi_os_good_alloc_size(size_t size) { // NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) // We hide MEM_EXTENDED_PARAMETER to compile with older SDK's. 
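As an aside to the options.c hunk above (not part of the patch): the flipped defaults can be overridden by an embedding application through the public option API, or typically via environment variables such as MIMALLOC_PAGE_RESET=0. A minimal sketch, assuming only the documented mi_option_set / mi_option_is_enabled entry points and the option names shown in the table above:

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // opt out of the new defaults: keep reset as a plain reset instead of a
  // decommit, and do not reset pages on free
  mi_option_set(mi_option_reset_decommits, 0);
  mi_option_set(mi_option_page_reset, 0);

  printf("page_reset=%d reset_decommits=%d\n",
         mi_option_is_enabled(mi_option_page_reset),
         mi_option_is_enabled(mi_option_reset_decommits));
  return 0;
}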
#include -typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ void*, ULONG); -typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ PVOID, ULONG); +typedef PVOID(__stdcall* PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ void*, ULONG); +typedef NTSTATUS(__stdcall* PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ PVOID, ULONG); static PVirtualAlloc2 pVirtualAlloc2 = NULL; static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; @@ -129,7 +129,7 @@ static bool mi_win_enable_large_os_pages() if (err == 0) err = GetLastError(); _mi_warning_message("cannot enable large OS page support, error %lu\n", err); } - return (ok!=0); + return (ok != 0); } void _mi_os_init(void) { @@ -144,7 +144,7 @@ void _mi_os_init(void) { if (hDll != NULL) { // use VirtualAlloc2FromApp if possible as it is available to Windows store apps pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp"); - if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2"); + if (pVirtualAlloc2 == NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2"); FreeLibrary(hDll); } hDll = LoadLibrary(TEXT("ntdll.dll")); @@ -170,7 +170,7 @@ void _mi_os_init() { os_alloc_granularity = os_page_size; } if (mi_option_is_enabled(mi_option_large_os_pages)) { - large_os_page_size = 2*MiB; + large_os_page_size = 2 * MiB; } } #endif @@ -210,7 +210,7 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment #if (MI_INTPTR_SIZE >= 8) // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations void* hint; - if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) { + if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment, size)) != NULL) { return VirtualAlloc(hint, size, flags, PAGE_READWRITE); } #endif @@ -233,7 +233,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; void* p = NULL; if ((large_only || use_large_os_page(size, try_alignment)) - && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { + && allow_large && (flags & MEM_COMMIT) != 0 && (flags & MEM_RESERVE) != 0) { uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); if (!large_only && try_ok > 0) { // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. @@ -247,12 +247,12 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, if (large_only) return p; // fall back to non-large page allocation on error (`p == NULL`). 
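For illustration only, the back-off around `large_page_try_ok` above can be summarized in isolation. The sketch below uses plain C11 atomics instead of the mi_atomic_* wrappers, and `try_large`/`fallback` are hypothetical callbacks standing in for the large-page and regular VirtualAlloc paths:

#include <stdatomic.h>
#include <stddef.h>

static _Atomic size_t large_page_try_ok;   // shared back-off counter (starts at 0)

void* alloc_with_large_page_backoff(size_t size,
                                    void* (*try_large)(size_t),
                                    void* (*fallback)(size_t)) {
  size_t try_ok = atomic_load(&large_page_try_ok);
  if (try_ok > 0) {
    // a recent large-page attempt failed: count down and skip the expensive path
    atomic_compare_exchange_weak(&large_page_try_ok, &try_ok, try_ok - 1);
  }
  else {
    void* p = try_large(size);
    if (p != NULL) return p;
    // on failure, don't try large pages again for the next N allocations
    atomic_store(&large_page_try_ok, 10);
  }
  return fallback(size);
}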
if (p == NULL) { - mi_atomic_write(&large_page_try_ok,10); // on error, don't try again for the next N allocations + mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations } } } if (p == NULL) { - *is_large = ((flags&MEM_LARGE_PAGES) != 0); + *is_large = ((flags & MEM_LARGE_PAGES) != 0); p = mi_win_virtual_allocx(addr, size, try_alignment, flags); } if (p == NULL) { @@ -264,8 +264,8 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, #elif defined(__wasi__) static void* mi_wasm_heap_grow(size_t size, size_t try_alignment) { uintptr_t base = __builtin_wasm_memory_size(0) * _mi_os_page_size(); - uintptr_t aligned_base = _mi_align_up(base, (uintptr_t) try_alignment); - size_t alloc_size = _mi_align_up( aligned_base - base + size, _mi_os_page_size()); + uintptr_t aligned_base = _mi_align_up(base, (uintptr_t)try_alignment); + size_t alloc_size = _mi_align_up(aligned_base - base + size, _mi_os_page_size()); mi_assert(alloc_size >= size && (alloc_size % _mi_os_page_size()) == 0); if (alloc_size < size) return NULL; if (__builtin_wasm_memory_grow(0, alloc_size / _mi_os_page_size()) == SIZE_MAX) { @@ -278,47 +278,50 @@ static void* mi_wasm_heap_grow(size_t size, size_t try_alignment) { #define MI_OS_USE_MMAP static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { void* p = NULL; - #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) +#if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) // on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations void* hint; if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment, size)) != NULL) { - p = mmap(hint,size,protect_flags,flags,fd,0); - if (p==MAP_FAILED) p = NULL; // fall back to regular mmap + p = mmap(hint, size, protect_flags, flags, fd, 0); + if (p == MAP_FAILED) p = NULL; // fall back to regular mmap } - #else +#else UNUSED(try_alignment); - #endif - if (p==NULL) { - p = mmap(addr,size,protect_flags,flags,fd,0); - if (p==MAP_FAILED) p = NULL; +#endif + if (p == NULL) { + p = mmap(addr, size, protect_flags, flags, fd, 0); + if (p == MAP_FAILED) p = NULL; } return p; } static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) { void* p = NULL; - #if !defined(MAP_ANONYMOUS) - #define MAP_ANONYMOUS MAP_ANON - #endif - int flags = MAP_PRIVATE | MAP_ANONYMOUS; +#if !defined(MAP_ANONYMOUS) +#define MAP_ANONYMOUS MAP_ANON +#endif +#if !defined(MAP_NORESERVE) +#define MAP_NORESERVE 0 +#endif + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; int fd = -1; - #if defined(MAP_ALIGNED) // BSD +#if defined(MAP_ALIGNED) // BSD if (try_alignment > 0) { size_t n = _mi_bsr(try_alignment); if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB flags |= MAP_ALIGNED(n); } } - #endif - #if defined(PROT_MAX) +#endif +#if defined(PROT_MAX) protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD - #endif - #if defined(VM_MAKE_TAG) - // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99) +#endif +#if defined(VM_MAKE_TAG) +// macOS: tracking anonymous page with a specific ID. 
(All up to 98 are taken officially but LLVM sanitizers had taken 99) int os_tag = (int)mi_option_get(mi_option_os_tag); if (os_tag < 100 || os_tag > 255) os_tag = 100; fd = VM_MAKE_TAG(os_tag); - #endif +#endif if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) { static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); @@ -332,39 +335,39 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro else { int lflags = flags; int lfd = fd; - #ifdef MAP_ALIGNED_SUPER +#ifdef MAP_ALIGNED_SUPER lflags |= MAP_ALIGNED_SUPER; - #endif - #ifdef MAP_HUGETLB +#endif +#ifdef MAP_HUGETLB lflags |= MAP_HUGETLB; - #endif - #ifdef MAP_HUGE_1GB +#endif +#ifdef MAP_HUGE_1GB static bool mi_huge_pages_available = true; if ((size % GiB) == 0 && mi_huge_pages_available) { lflags |= MAP_HUGE_1GB; } else - #endif +#endif { - #ifdef MAP_HUGE_2MB +#ifdef MAP_HUGE_2MB lflags |= MAP_HUGE_2MB; - #endif +#endif } - #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB +#ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; - #endif +#endif if (large_only || lflags != flags) { // try large OS page allocation *is_large = true; p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); - #ifdef MAP_HUGE_1GB +#ifdef MAP_HUGE_1GB if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { mi_huge_pages_available = false; // don't try huge 1GiB pages again _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno); lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); } - #endif +#endif if (large_only) return p; if (p == NULL) { mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations @@ -375,7 +378,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro if (p == NULL) { *is_large = false; p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); - #if defined(MADV_HUGEPAGE) +#if defined(MADV_HUGEPAGE) // Many Linux systems don't allow MAP_HUGETLB but they support instead // transparent huge pages (THP). 
It is not required to call `madvise` with MADV_HUGE // though since properly aligned allocations will already use large pages if available @@ -387,7 +390,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro *is_large = true; // possibly }; } - #endif +#endif } return p; } @@ -401,18 +404,18 @@ static volatile _Atomic(intptr_t) aligned_base; // Return a 4MiB aligned address that is probably available static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { if (try_alignment == 0 || try_alignment > MI_SEGMENT_SIZE) return NULL; - if ((size%MI_SEGMENT_SIZE) != 0) return NULL; + if ((size % MI_SEGMENT_SIZE) != 0) return NULL; intptr_t hint = mi_atomic_add(&aligned_base, size); - if (hint == 0 || hint > ((intptr_t)30<<40)) { // try to wrap around after 30TiB (area after 32TiB is used for huge OS pages) + if (hint == 0 || hint > ((intptr_t)30 << 40)) { // try to wrap around after 30TiB (area after 32TiB is used for huge OS pages) intptr_t init = ((intptr_t)4 << 40); // start at 4TiB area - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode +#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode uintptr_t r = _mi_random_init((uintptr_t)&mi_os_get_aligned_hint ^ hint); - init = init + (MI_SEGMENT_SIZE * ((r>>17) & 0xFFFF)); // (randomly 0-64k)*4MiB == 0 to 256GiB - #endif + init = init + (MI_SEGMENT_SIZE * ((r >> 17) & 0xFFFF)); // (randomly 0-64k)*4MiB == 0 to 256GiB +#endif mi_atomic_cas_strong(mi_atomic_cast(uintptr_t, &aligned_base), init, hint + size); hint = mi_atomic_add(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all } - if (hint%try_alignment != 0) return NULL; + if (hint % try_alignment != 0) return NULL; return (void*)hint; } #else @@ -441,17 +444,17 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo } */ - #if defined(_WIN32) - int flags = MEM_RESERVE; - if (commit) flags |= MEM_COMMIT; - p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); - #elif defined(__wasi__) - *is_large = false; - p = mi_wasm_heap_grow(size, try_alignment); - #else - int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); - p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); - #endif +#if defined(_WIN32) + int flags = MEM_RESERVE; + if (commit) flags |= MEM_COMMIT; + p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); +#elif defined(__wasi__) + *is_large = false; + p = mi_wasm_heap_grow(size, try_alignment); +#else + int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); + p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); +#endif mi_stat_counter_increase(stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); @@ -561,7 +564,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* lar allow_large = *large; *large = false; } - return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), tld->stats); + return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large != NULL ? 
large : &allow_large), tld->stats); } @@ -613,7 +616,7 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ _mi_stat_decrease(&stats->committed, csize); } - #if defined(_WIN32) +#if defined(_WIN32) if (commit) { // if the memory was already committed, the call succeeds but it is not zero'd // *is_zero = true; @@ -624,28 +627,42 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT); err = (ok ? 0 : GetLastError()); } - #elif defined(__wasi__) +#elif defined(__wasi__) // WebAssembly guests can't control memory protection - #else +#elif defined(MAP_FIXED) + if (!commit) { + // use mmap with MAP_FIXED to discard the existing memory (and reduce commit charge) + void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), -1, 0); + if (p != start) { err = errno; } + } + else { + // for commit, just change the protection + err = mprotect(start, csize, (PROT_READ | PROT_WRITE)); + if (err != 0) { err = errno; } + } +#else err = mprotect(start, csize, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE)); if (err != 0) { err = errno; } - #endif +#endif if (err != 0) { - _mi_warning_message("commit/decommit error: start: 0x%p, csize: 0x%x, err: %i\n", start, csize, err); + _mi_warning_message("%s error: start: 0x%p, csize: 0x%x, err: %i\n", commit ? "commit" : "decommit", start, csize, err); } mi_assert_internal(err == 0); return (err == 0); } bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) { - return mi_os_commitx(addr, size, true, false /* conservative? */, is_zero, stats); + return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats); } bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats) { bool is_zero; - return mi_os_commitx(addr, size, false, true /* conservative? */, &is_zero, stats); + return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats); } +bool _mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) { + return mi_os_commitx(addr, size, true, true /* conservative */, is_zero, stats); +} // Signal to the OS that the address range is no longer in use // but may be used later again. This will release physical memory @@ -657,24 +674,24 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) if (reset) _mi_stat_increase(&stats->reset, csize); - else _mi_stat_decrease(&stats->reset, csize); + else _mi_stat_decrease(&stats->reset, csize); if (!reset) return true; // nothing to do on unreset! 
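A stripped-down sketch of the new POSIX decommit/commit path introduced in the hunk above, assuming page-aligned arguments and omitting the statistics and conservative page-alignment handling done by mi_os_commitx (the names os_decommit/os_commit are placeholders, not mimalloc API):

#include <sys/mman.h>
#include <stdbool.h>
#include <stddef.h>

#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
#ifndef MAP_NORESERVE
#define MAP_NORESERVE 0
#endif

static bool os_decommit(void* start, size_t size) {
  // replace the range with a fresh PROT_NONE mapping; MAP_NORESERVE drops the
  // commit charge, and the pages read back as zero once committed again
  void* p = mmap(start, size, PROT_NONE,
                 MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
  return (p == start);
}

static bool os_commit(void* start, size_t size) {
  // make the pages accessible again; the kernel commits them lazily on first touch
  return (mprotect(start, size, PROT_READ | PROT_WRITE) == 0);
}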
- #if (MI_DEBUG>1) - if (MI_SECURE==0) { +#if (MI_DEBUG>1) + if (MI_SECURE == 0) { memset(start, 0, csize); // pretend it is eagerly reset } - #endif +#endif #if defined(_WIN32) // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE); mi_assert_internal(p == start); - #if 1 +#if 1 if (p == start && start != NULL) { - VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set + VirtualUnlock(start, csize); // VirtualUnlock after MEM_RESET removes the memory from the working set } - #endif +#endif if (p != start) return false; #else #if defined(MADV_FREE) @@ -704,12 +721,22 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) // pages and reduce swapping while keeping the memory committed. // We page align to a conservative area inside the range to reset. bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { - return mi_os_resetx(addr, size, true, stats); + if (mi_option_is_enabled(mi_option_reset_decommits)) { + return _mi_os_decommit(addr, size, stats); + } + else { + return mi_os_resetx(addr, size, true, stats); + } } bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) { - *is_zero = false; - return mi_os_resetx(addr, size, false, stats); + if (mi_option_is_enabled(mi_option_reset_decommits)) { + return _mi_os_commit_unreset(addr, size, is_zero, stats); // re-commit it (conservatively!) + } + else { + *is_zero = false; + return mi_os_resetx(addr, size, false, stats); + } } @@ -721,7 +748,7 @@ static bool mi_os_protectx(void* addr, size_t size, bool protect) { if (csize == 0) return false; /* if (_mi_os_is_huge_reserved(addr)) { - _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n"); + _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n"); } */ int err = 0; @@ -753,7 +780,7 @@ bool _mi_os_unprotect(void* addr, size_t size) { bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) { // page align conservatively within the range - mi_assert_internal(oldsize > newsize && p != NULL); + mi_assert_internal(oldsize > newsize&& p != NULL); if (oldsize < newsize || p == NULL) return false; if (oldsize == newsize) return true; @@ -781,20 +808,20 @@ and possibly associated with a specific NUMA node. 
(use `numa_node>=0`) #if defined(WIN32) && (MI_INTPTR_SIZE >= 8) static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { - mi_assert_internal(size%GiB == 0); + mi_assert_internal(size % GiB == 0); mi_assert_internal(addr != NULL); const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; mi_win_enable_large_os_pages(); - #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) +#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} }; // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages static bool mi_huge_pages_available = true; if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { - #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE - #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) - #endif +#ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE +#define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) +#endif params[0].Type = 5; // == MemExtendedParameterAttributeFlags; params[0].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; ULONG param_count = 1; @@ -821,7 +848,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) params[0].ULong = (unsigned)numa_node; return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); } - #endif +#endif // otherwise use regular virtual alloc on older windows return VirtualAlloc(addr, size, flags, PAGE_READWRITE); } @@ -842,16 +869,16 @@ static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, cons } #endif static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { - mi_assert_internal(size%GiB == 0); + mi_assert_internal(size % GiB == 0); bool is_large = true; void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); if (p == NULL) return NULL; - if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes + if (numa_node >= 0 && numa_node < 8 * MI_INTPTR_SIZE) { // at most 64 nodes uintptr_t numa_mask = (1UL << numa_node); // TODO: does `mbind` work correctly for huge OS pages? should we // use `set_mempolicy` before calling mmap instead? 
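As a purely hypothetical illustration of the alternative raised in the TODO above (the patch itself keeps using mbind): a thread could install a preferred-node policy with set_mempolicy before the mmap and restore the default afterwards. Linux-only; the constants mirror <numaif.h> and the helper names are made up:

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MPOL_DEFAULT
#define MPOL_DEFAULT   0
#define MPOL_PREFERRED 1
#endif

// prefer allocations from `numa_node` for the calling thread (best effort)
static long set_preferred_node(int numa_node) {
  if (numa_node < 0 || numa_node >= 8*(int)sizeof(unsigned long)) return 0; // no binding
  unsigned long mask = (1UL << numa_node);
  return syscall(SYS_set_mempolicy, MPOL_PREFERRED, &mask, 8*sizeof(unsigned long));
}

// go back to the default (local) allocation policy
static long reset_node_policy(void) {
  return syscall(SYS_set_mempolicy, MPOL_DEFAULT, NULL, 0UL);
}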
// see: - long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); + long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8 * MI_INTPTR_SIZE, 0); if (err != 0) { _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno)); } @@ -883,7 +910,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { start = ((uintptr_t)32 << 40); // 32TiB virtual start address #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode uintptr_t r = _mi_random_init((uintptr_t)&mi_os_claim_huge_pages); - start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB + start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r >> 17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB #endif } end = start + size; @@ -936,8 +963,8 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse if (max_msecs > 0) { mi_msecs_t elapsed = _mi_clock_end(start_t); if (page >= 1) { - mi_msecs_t estimate = ((elapsed / (page+1)) * pages); - if (estimate > 2*max_msecs) { // seems like we are going to timeout, break + mi_msecs_t estimate = ((elapsed / (page + 1)) * pages); + if (estimate > 2 * max_msecs) { // seems like we are going to timeout, break elapsed = max_msecs + 1; } } @@ -947,7 +974,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse } } } - mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size); + mi_assert_internal(page * MI_HUGE_OS_PAGE_SIZE <= size); if (pages_reserved != NULL) *pages_reserved = page; if (psize != NULL) *psize = page * MI_HUGE_OS_PAGE_SIZE; return (page == 0 ? NULL : start); @@ -956,7 +983,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // free every huge page in a range individually (as we allocated per page) // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) { - if (p==NULL || size==0) return; + if (p == NULL || size == 0) return; uint8_t* base = (uint8_t*)p; while (size >= MI_HUGE_OS_PAGE_SIZE) { _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats); @@ -972,7 +999,7 @@ static size_t mi_os_numa_nodex() { PROCESSOR_NUMBER pnum; USHORT numa_node = 0; GetCurrentProcessorNumberEx(&pnum); - GetNumaProcessorNodeEx(&pnum,&numa_node); + GetNumaProcessorNodeEx(&pnum, &numa_node); return numa_node; } @@ -999,12 +1026,12 @@ static size_t mi_os_numa_nodex(void) { static size_t mi_os_numa_node_countx(void) { char buf[128]; unsigned node = 0; - for(node = 0; node < 256; node++) { + for (node = 0; node < 256; node++) { // enumerate node entries -- todo: it there a more efficient way to do this? 
(but ensure there is no allocation) snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); - if (access(buf,R_OK) != 0) break; + if (access(buf, R_OK) != 0) break; } - return (node+1); + return (node + 1); } #else static size_t mi_os_numa_nodex(void) { @@ -1031,7 +1058,7 @@ size_t _mi_os_numa_node_count_get(void) { int _mi_os_numa_node_get(mi_os_tld_t* tld) { UNUSED(tld); size_t numa_count = _mi_os_numa_node_count(); - if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 + if (numa_count <= 1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 size_t numa_node = mi_os_numa_nodex(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } From 211f1aa5190f063ee8eef237473281535c2be79f Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Wed, 20 Nov 2019 14:55:12 -0800 Subject: [PATCH 47/48] remove reset delay slots; add reset tracking per page and segment --- include/mimalloc-internal.h | 8 +- include/mimalloc-types.h | 28 +--- include/mimalloc.h | 3 +- src/arena.c | 8 +- src/bitmap.inc.c | 54 ++++++-- src/init.c | 11 +- src/memory.c | 199 +++++++++++++++------------ src/options.c | 5 +- src/os.c | 204 ++++++++++++++-------------- src/page.c | 7 +- src/segment.c | 264 ++++++++++++++++++++++-------------- 11 files changed, 443 insertions(+), 348 deletions(-) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index d727e563..ab295e65 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -59,7 +59,7 @@ size_t _mi_os_good_alloc_size(size_t size); // memory.c void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld); -void _mi_mem_free(void* p, size_t size, size_t id, mi_os_tld_t* tld); +void _mi_mem_free(void* p, size_t size, size_t id, bool fully_committed, bool any_reset, mi_os_tld_t* tld); bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld); bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld); @@ -75,7 +75,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld); void _mi_segment_thread_collect(mi_segments_tld_t* tld); -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size); // page start for any page +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size); // page start for any page // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc; @@ -297,7 +297,9 @@ static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const // Quick page start for initialized pages static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { - return _mi_segment_page_start(segment, page, page->block_size, page_size); + const size_t bsize = page->block_size; + mi_assert_internal(bsize > 0 && (bsize%sizeof(void*)) == 0); + return _mi_segment_page_start(segment, page, bsize, page_size, NULL); } // Get the page containing the pointer diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 0ce91339..e816c3a6 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -384,31 +384,12 @@ void 
_mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) - -// ------------------------------------------------------ -// Delay slots (to avoid expensive OS calls) -// ------------------------------------------------------ -typedef int64_t mi_msecs_t; - -#define MI_RESET_DELAY_SLOTS (256) - -typedef struct mi_delay_slot_s { - mi_msecs_t expire; - uint8_t* addr; - size_t size; -} mi_delay_slot_t; - -typedef struct mi_delay_slots_s { - size_t capacity; // always `MI_RESET_DELAY_SLOTS` - size_t count; // current slots used (`<= capacity`) - mi_delay_slot_t slots[MI_RESET_DELAY_SLOTS]; -} mi_delay_slots_t; - - // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ +typedef int64_t mi_msecs_t; + // Queue of segments typedef struct mi_segment_queue_s { mi_segment_t* first; @@ -417,9 +398,8 @@ typedef struct mi_segment_queue_s { // OS thread local data typedef struct mi_os_tld_s { - size_t region_idx; // start point for next allocation - mi_delay_slots_t* reset_delay; // delay slots for OS reset operations - mi_stats_t* stats; // points to tld stats + size_t region_idx; // start point for next allocation + mi_stats_t* stats; // points to tld stats } mi_os_tld_t; // Segments thread local data diff --git a/include/mimalloc.h b/include/mimalloc.h index a59b9cf7..197b1734 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -272,8 +272,9 @@ typedef enum mi_option_e { mi_option_segment_cache, mi_option_page_reset, mi_option_segment_reset, - mi_option_eager_commit_delay, mi_option_reset_decommits, + mi_option_eager_commit_delay, + mi_option_reset_delay, mi_option_use_numa_nodes, mi_option_os_tag, mi_option_max_errors, diff --git a/src/arena.c b/src/arena.c index 46741208..4a596b2c 100644 --- a/src/arena.c +++ b/src/arena.c @@ -107,7 +107,7 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* size_t idx = mi_atomic_read(&arena->search_idx); // start from last search for (size_t visited = 0; visited < fcount; visited++, idx++) { if (idx >= fcount) idx = 0; // wrap around - if (mi_bitmap_try_claim_field(arena->blocks_inuse, idx, blocks, bitmap_idx)) { + if (mi_bitmap_try_find_claim_field(arena->blocks_inuse, idx, blocks, bitmap_idx)) { mi_atomic_write(&arena->search_idx, idx); // start search from here next time return true; } @@ -137,9 +137,9 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n } else if (commit) { // ensure commit now - bool any_zero; - mi_bitmap_claim(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_zero); - if (any_zero) { + bool any_uncommitted; + mi_bitmap_claim(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); + if (any_uncommitted) { bool commit_zero; _mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats); if (commit_zero) *is_zero = true; diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index 81f87a79..11ada472 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -104,9 +104,29 @@ static inline size_t mi_bsr(uintptr_t x) { Claim a bit sequence atomically ----------------------------------------------------------- */ +// Try to atomically claim a sequence of `count` bits at in `idx` +// in the bitmap field. 
Returns `true` on success. +static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_fields, const size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const uintptr_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); + mi_assert_internal(bitidx + count <= MI_BITMAP_FIELD_BITS); + + mi_bitmap_field_t field = mi_atomic_read_relaxed(&bitmap[idx]); + if ((field & mask) == 0) { // free? + if (mi_atomic_cas_strong(&bitmap[idx], (field|mask), field)) { + // claimed! + return true; + } + } + return false; +} + + // Try to atomically claim a sequence of `count` bits in a single // field at `idx` in `bitmap`. Returns `true` on success. -static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) +static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { mi_assert_internal(bitmap_idx != NULL); volatile _Atomic(uintptr_t)* field = &bitmap[idx]; @@ -160,9 +180,9 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con // Find `count` bits of 0 and set them to 1 atomically; returns `true` on success. // For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields. -static inline bool mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t* bitmap_idx) { +static inline bool mi_bitmap_try_find_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t* bitmap_idx) { for (size_t idx = 0; idx < bitmap_fields; idx++) { - if (mi_bitmap_try_claim_field(bitmap, idx, count, bitmap_idx)) { + if (mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { return true; } } @@ -170,39 +190,51 @@ static inline bool mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, } // Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously +// Returns `true` if all `count` bits were 1 previously. static inline bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); - mi_assert_internal((bitmap[idx] & mask) == mask); + // mi_assert_internal((bitmap[idx] & mask) == mask); uintptr_t prev = mi_atomic_and(&bitmap[idx], ~mask); return ((prev & mask) == mask); } // Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously +// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. 
static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); - // mi_assert_internal((bitmap[idx] & mask) == 0); + //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); uintptr_t prev = mi_atomic_or(&bitmap[idx], mask); if (any_zero != NULL) *any_zero = ((prev & mask) != mask); return ((prev & mask) == 0); } -// Returns `true` if all `count` bits were 1 -static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { +// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one. +static inline bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); - // mi_assert_internal((bitmap[idx] & mask) == 0); - return ((mi_atomic_read(&bitmap[idx]) & mask) == mask); + mi_bitmap_field_t field = mi_atomic_read_relaxed(&bitmap[idx]); + if (any_ones != NULL) *any_ones = ((field & mask) != 0); + return ((field & mask) == mask); } +static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL); +} + +static inline bool mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + bool any_ones; + mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); + return any_ones; +} + + #endif diff --git a/src/init.c b/src/init.c index f9735462..468fd46f 100644 --- a/src/init.c +++ b/src/init.c @@ -97,13 +97,11 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; #define tld_main_stats ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats))) #define tld_main_os ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os))) -static mi_delay_slots_t tld_reset_delay_main = { MI_RESET_DELAY_SLOTS, 0, { {0,NULL,0} } }; - static mi_tld_t tld_main = { 0, false, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os }, // segments - { 0, &tld_reset_delay_main, tld_main_stats }, // os + { 0, tld_main_stats }, // os { MI_STATS_NULL } // stats }; @@ -194,8 +192,7 @@ uintptr_t _mi_random_init(uintptr_t seed /* can be zero */) { typedef struct mi_thread_data_s { mi_heap_t heap; // must come first due to cast in `_mi_heap_done` - mi_tld_t tld; - mi_delay_slots_t reset_delay; + mi_tld_t tld; } mi_thread_data_t; // Initialize the thread local default heap, called from `mi_thread_init` @@ -215,7 +212,6 @@ static bool _mi_heap_init(void) { } mi_tld_t* tld = &td->tld; mi_heap_t* heap = &td->heap; - mi_delay_slots_t* reset_delay = &td->reset_delay; memcpy(heap, &_mi_heap_empty, sizeof(*heap)); heap->thread_id = _mi_thread_id(); heap->random = _mi_random_init(heap->thread_id); @@ -226,9 +222,6 @@ static bool _mi_heap_init(void) { tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; tld->os.stats = 
&tld->stats; - tld->os.reset_delay = reset_delay; - memset(reset_delay, 0, sizeof(*reset_delay)); - reset_delay->capacity = MI_RESET_DELAY_SLOTS; _mi_heap_set_default_direct(heap); } return false; diff --git a/src/memory.c b/src/memory.c index b0bcf7a0..94b6348f 100644 --- a/src/memory.c +++ b/src/memory.c @@ -54,6 +54,7 @@ void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, s void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld); + // Constants #if (MI_INTPTR_SIZE==8) #define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 48KiB for the region map @@ -73,28 +74,26 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo // Region info is a pointer to the memory region and two bits for // its flags: is_large, and is_committed. -typedef uintptr_t mi_region_info_t; - -static inline mi_region_info_t mi_region_info_create(void* start, bool is_large, bool is_committed) { - return ((uintptr_t)start | ((uintptr_t)(is_large?1:0) << 1) | (is_committed?1:0)); -} - -static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, bool* is_committed) { - if (is_large) *is_large = ((info&0x02) != 0); - if (is_committed) *is_committed = ((info&0x01) != 0); - return (void*)(info & ~0x03); -} +typedef union mi_region_info_u { + uintptr_t value; + struct { + bool valid; + bool is_large; + int numa_node; + }; +} mi_region_info_t; // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. typedef struct mem_region_s { - volatile _Atomic(mi_region_info_t) info; // start of the memory area (and flags) - volatile _Atomic(uintptr_t) numa_node; // associated numa node + 1 (so 0 is no association) + volatile _Atomic(uintptr_t) info; // is_large, and associated numa node + 1 (so 0 is no association) + volatile _Atomic(void*) start; // start of the memory area (and flags) mi_bitmap_field_t in_use; // bit per in-use block mi_bitmap_field_t dirty; // track if non-zero per block mi_bitmap_field_t commit; // track if committed per block (if `!info.is_committed)) - size_t arena_memid; // if allocated from a (huge page) arena + mi_bitmap_field_t reset; // track reset per block + volatile _Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena- } mem_region_t; // The region map @@ -113,24 +112,32 @@ static size_t mi_region_block_count(size_t size) { return _mi_divide_up(size, MI_SEGMENT_SIZE); } +/* // Return a rounded commit/reset size such that we don't fragment large OS pages into small ones. static size_t mi_good_commit_size(size_t size) { if (size > (SIZE_MAX - _mi_os_large_page_size())) return size; return _mi_align_up(size, _mi_os_large_page_size()); } +*/ // Return if a pointer points into a region reserved by us. 
bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { if (p==NULL) return false; size_t count = mi_atomic_read_relaxed(®ions_count); for (size_t i = 0; i < count; i++) { - uint8_t* start = (uint8_t*)mi_region_info_read( mi_atomic_read_relaxed(®ions[i].info), NULL, NULL); + uint8_t* start = (uint8_t*)mi_atomic_read_ptr_relaxed(®ions[i].start); if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true; } return false; } +static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) { + void* start = mi_atomic_read_ptr(®ion->start); + mi_assert_internal(start != NULL); + return ((uint8_t*)start + (bit_idx * MI_SEGMENT_SIZE)); +} + static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) { mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS); size_t idx = region - regions; @@ -142,13 +149,10 @@ static size_t mi_memid_create_from_arena(size_t arena_memid) { return (arena_memid << 1) | 1; } -static bool mi_memid_is_arena(size_t id) { - return ((id&1)==1); -} -static bool mi_memid_indices(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) { - if (mi_memid_is_arena(id)) { - *arena_memid = (id>>1); +static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) { + if ((id&1)==1) { + if (arena_memid != NULL) *arena_memid = (id>>1); return true; } else { @@ -159,6 +163,7 @@ static bool mi_memid_indices(size_t id, mem_region_t** region, mi_bitmap_index_t } } + /* ---------------------------------------------------------------------------- Allocate a region is allocated from the OS (or an arena) -----------------------------------------------------------------------------*/ @@ -187,16 +192,21 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, // allocated, initialize and claim the initial blocks mem_region_t* r = ®ions[idx]; - r->numa_node = _mi_os_numa_node(tld) + 1; - r->arena_memid = arena_memid; + r->arena_memid = arena_memid; mi_atomic_write(&r->in_use, 0); mi_atomic_write(&r->dirty, (is_zero ? 0 : ~0UL)); mi_atomic_write(&r->commit, (region_commit ? ~0UL : 0)); + mi_atomic_write(&r->reset, 0); *bit_idx = 0; mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL); + mi_atomic_write_ptr(&r->start, start); // and share it - mi_atomic_write(&r->info, mi_region_info_create(start, region_large, region_commit)); // now make it available to others + mi_region_info_t info; + info.valid = true; + info.is_large = region_large; + info.numa_node = _mi_os_numa_node(tld); + mi_atomic_write(&r->info, info.value); // now make it available to others *region = r; return true; } @@ -207,36 +217,33 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) { // initialized at all? 
- mi_region_info_t info = mi_atomic_read_relaxed(®ion->info); - if (info==0) return false; + mi_region_info_t info; + info.value = mi_atomic_read_relaxed(®ion->info); + if (info.value==0) return false; // numa correct if (numa_node >= 0) { // use negative numa node to always succeed - int rnode = ((int)mi_atomic_read_relaxed(®ion->numa_node)) - 1; + int rnode = info.numa_node; if (rnode >= 0 && rnode != numa_node) return false; } // check allow-large - bool is_large; - bool is_committed; - mi_region_info_read(info, &is_large, &is_committed); - if (!allow_large && is_large) return false; + if (!allow_large && info.is_large) return false; return true; } -static bool mi_region_try_claim(size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) +static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { - // try all regions for a free slot - const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld)); + // try all regions for a free slot const size_t count = mi_atomic_read(®ions_count); size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around mem_region_t* r = ®ions[idx]; if (mi_region_is_suitable(r, numa_node, allow_large)) { - if (mi_bitmap_try_claim_field(&r->in_use, 0, blocks, bit_idx)) { + if (mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) { tld->region_idx = idx; // remember the last found position *region = r; return true; @@ -252,8 +259,9 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS); mem_region_t* region; mi_bitmap_index_t bit_idx; - // first try to claim in existing regions - if (!mi_region_try_claim(blocks, *is_large, ®ion, &bit_idx, tld)) { + const int numa_node = (_mi_os_numa_node_count() <= 1 ? 
-1 : _mi_os_numa_node(tld)); + // try to claim in existing regions + if (!mi_region_try_claim(numa_node, blocks, *is_large, ®ion, &bit_idx, tld)) { // otherwise try to allocate a fresh region if (!mi_region_try_alloc_os(blocks, *commit, *is_large, ®ion, &bit_idx, tld)) { // out of regions or memory @@ -261,30 +269,28 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo } } + // found a region and claimed `blocks` at `bit_idx` mi_assert_internal(region != NULL); mi_assert_internal(mi_bitmap_is_claimed(®ion->in_use, 1, blocks, bit_idx)); - mi_region_info_t info = mi_atomic_read(®ion->info); - bool region_is_committed = false; - bool region_is_large = false; - void* start = mi_region_info_read(info, ®ion_is_large, ®ion_is_committed); - mi_assert_internal(!(region_is_large && !*is_large)); + mi_region_info_t info; + info.value = mi_atomic_read(®ion->info); + void* start = mi_atomic_read_ptr(®ion->start); + mi_assert_internal(!(info.is_large && !*is_large)); mi_assert_internal(start != NULL); - *is_zero = mi_bitmap_claim(®ion->dirty, 1, blocks, bit_idx, NULL); - *is_large = region_is_large; + *is_zero = mi_bitmap_unclaim(®ion->dirty, 1, blocks, bit_idx); + *is_large = info.is_large; *memid = mi_memid_create(region, bit_idx); void* p = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE); - if (region_is_committed) { - // always committed - *commit = true; - } - else if (*commit) { + + // commit + if (*commit) { // ensure commit - bool any_zero; - mi_bitmap_claim(®ion->commit, 1, blocks, bit_idx, &any_zero); - if (any_zero) { + bool any_uncommitted; + mi_bitmap_claim(®ion->commit, 1, blocks, bit_idx, &any_uncommitted); + if (any_uncommitted) { bool commit_zero; _mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld); if (commit_zero) *is_zero = true; @@ -294,6 +300,21 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo // no need to commit, but check if already fully committed *commit = mi_bitmap_is_claimed(®ion->commit, 1, blocks, bit_idx); } + mi_assert_internal(mi_bitmap_is_claimed(®ion->commit, 1, blocks, bit_idx)); + + // unreset reset blocks + if (mi_bitmap_is_any_claimed(®ion->reset, 1, blocks, bit_idx)) { + mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit); + mi_bitmap_unclaim(®ion->reset, 1, blocks, bit_idx); + bool reset_zero; + _mi_mem_unreset(p, blocks * MI_SEGMENT_SIZE, &reset_zero, tld); + if (reset_zero) *is_zero = true; + } + mi_assert_internal(!mi_bitmap_is_any_claimed(®ion->reset, 1, blocks, bit_idx)); + + #if (MI_DEBUG>=2) + if (*commit) { ((uint8_t*)p)[0] = 0; } + #endif // and return the allocation mi_assert_internal(p != NULL); @@ -325,7 +346,9 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l void* p = mi_region_try_alloc(blocks, commit, large, is_zero, memid, tld); mi_assert_internal(p == NULL || (uintptr_t)p % alignment == 0); if (p != NULL) { + #if (MI_DEBUG>=2) if (*commit) { ((uint8_t*)p)[0] = 0; } + #endif return p; } _mi_warning_message("unable to allocate from region: size %zu\n", size); @@ -346,56 +369,56 @@ Free -----------------------------------------------------------------------------*/ // Free previously allocated memory with a given id. 
-void _mi_mem_free(void* p, size_t size, size_t id, mi_os_tld_t* tld) { +void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) { mi_assert_internal(size > 0 && tld != NULL); if (p==NULL) return; if (size==0) return; - + size = _mi_align_up(size, _mi_os_page_size()); + size_t arena_memid = 0; mi_bitmap_index_t bit_idx; mem_region_t* region; - if (mi_memid_indices(id,®ion,&bit_idx,&arena_memid)) { + if (mi_memid_is_arena(id,®ion,&bit_idx,&arena_memid)) { // was a direct arena allocation, pass through _mi_arena_free(p, size, arena_memid, tld->stats); } else { // allocated in a region mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return; - // we can align the size up to page size (as we allocate that way too) - // this ensures we fully commit/decommit/reset - size = _mi_align_up(size, _mi_os_page_size()); const size_t blocks = mi_region_block_count(size); - mi_region_info_t info = mi_atomic_read(®ion->info); - bool is_large; - bool is_committed; - void* start = mi_region_info_read(info, &is_large, &is_committed); - mi_assert_internal(start != NULL); - void* blocks_start = (uint8_t*)start + (bit_idx * MI_SEGMENT_SIZE); + mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS); + mi_region_info_t info; + info.value = mi_atomic_read(®ion->info); + mi_assert_internal(info.value != 0); + void* blocks_start = mi_region_blocks_start(region, bit_idx); mi_assert_internal(blocks_start == p); // not a pointer in our area? mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS); if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`? - // decommit (or reset) the blocks to reduce the working set. - // TODO: implement delayed decommit/reset as these calls are too expensive - // if the memory is reused soon. - // reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large - if (!is_large && - mi_option_is_enabled(mi_option_segment_reset) && - mi_option_is_enabled(mi_option_eager_commit)) // cannot reset halfway committed segments, use `option_page_reset` instead - { - // note: don't use `_mi_mem_reset` as it is shared with other threads! - _mi_os_reset(p, size, tld->stats); // TODO: maintain reset bits to unreset - } - if (!is_committed) { - // adjust commit statistics as we commit again when re-using the same slot - _mi_stat_decrease(&tld->stats->committed, mi_good_commit_size(size)); + // committed? + if (full_commit && (size % MI_SEGMENT_SIZE) == 0) { + mi_bitmap_claim(®ion->commit, 1, blocks, bit_idx, NULL); } - // TODO: should we free empty regions? currently only done _mi_mem_collect. - // this frees up virtual address space which might be useful on 32-bit systems? + if (any_reset) { + // set the is_reset bits if any pages were reset + mi_bitmap_claim(®ion->reset, 1, blocks, bit_idx, NULL); + } + + // reset the blocks to reduce the working set. 
+ if (!info.is_large && mi_option_is_enabled(mi_option_segment_reset) && + mi_option_is_enabled(mi_option_eager_commit)) // cannot reset halfway committed segments, use only `option_page_reset` instead + { + bool any_unreset; + mi_bitmap_claim(®ion->reset, 1, blocks, bit_idx, &any_unreset); + if (any_unreset) { + _mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld); + } + } // and unclaim - mi_bitmap_unclaim(®ion->in_use, 1, blocks, bit_idx); + bool all_unclaimed = mi_bitmap_unclaim(®ion->in_use, 1, blocks, bit_idx); + mi_assert_internal(all_unclaimed); UNUSED(all_unclaimed); } } @@ -416,13 +439,14 @@ void _mi_mem_collect(mi_os_tld_t* tld) { } while(m == 0 && !mi_atomic_cas_weak(®ion->in_use, MI_BITMAP_FIELD_FULL, 0 )); if (m == 0) { // on success, free the whole region - bool is_eager_committed; - void* start = mi_region_info_read(mi_atomic_read(®ions[i].info), NULL, &is_eager_committed); - if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { - _mi_arena_free(start, MI_REGION_SIZE, region->arena_memid, tld->stats); + void* start = mi_atomic_read_ptr(®ions[i].start); + size_t arena_memid = mi_atomic_read_relaxed(®ions[i].arena_memid); + memset(®ions[i], 0, sizeof(mem_region_t)); + // and release the whole region + mi_atomic_write(®ion->info, 0); + if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { + _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); } - // and release - mi_atomic_write(®ion->info,0); } } } @@ -432,6 +456,7 @@ void _mi_mem_collect(mi_os_tld_t* tld) { /* ---------------------------------------------------------------------------- Other -----------------------------------------------------------------------------*/ + bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) { return _mi_os_reset(p, size, tld->stats); } diff --git a/src/options.c b/src/options.c index 8c4c1707..9b6e4cd0 100644 --- a/src/options.c +++ b/src/options.c @@ -65,10 +65,11 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread - { 1, UNINIT, MI_OPTION(page_reset) }, // reset pages on free + { 0, UNINIT, MI_OPTION(page_reset) }, // reset pages on free { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) + { 1, UNINIT, MI_OPTION(reset_decommits) }, // reset decommits memory { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed - { 1, UNINIT, MI_OPTION(reset_decommits) }, // reset uses decommit/commit + { 500,UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output diff --git a/src/os.c b/src/os.c index 02683a02..553d72c9 100644 --- a/src/os.c +++ b/src/os.c @@ -77,11 +77,11 @@ static bool use_large_os_page(size_t size, size_t alignment) { // round to a good OS allocation size (bounded by max 12.5% waste) size_t _mi_os_good_alloc_size(size_t size) { size_t align_size; - if (size < 512 * KiB) align_size = _mi_os_page_size(); - else if (size < 2 * MiB) align_size = 64 * KiB; - else if (size < 8 * MiB) align_size = 256 * KiB; - else if (size < 32 * MiB) align_size = 1 * MiB; - else align_size = 4 * MiB; + if (size < 512*KiB) align_size = _mi_os_page_size(); + else if (size < 2*MiB) align_size = 64*KiB; + else if (size < 8*MiB) align_size = 256*KiB; + else if (size < 32*MiB) align_size = 1*MiB; + else align_size = 4*MiB; if (size >= (SIZE_MAX - align_size)) return size; // possible overflow? return _mi_align_up(size, align_size); } @@ -92,8 +92,8 @@ size_t _mi_os_good_alloc_size(size_t size) { // NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) // We hide MEM_EXTENDED_PARAMETER to compile with older SDK's. #include -typedef PVOID(__stdcall* PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ void*, ULONG); -typedef NTSTATUS(__stdcall* PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ PVOID, ULONG); +typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ void*, ULONG); +typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ PVOID, ULONG); static PVirtualAlloc2 pVirtualAlloc2 = NULL; static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; @@ -129,7 +129,7 @@ static bool mi_win_enable_large_os_pages() if (err == 0) err = GetLastError(); _mi_warning_message("cannot enable large OS page support, error %lu\n", err); } - return (ok != 0); + return (ok!=0); } void _mi_os_init(void) { @@ -144,7 +144,7 @@ void _mi_os_init(void) { if (hDll != NULL) { // use VirtualAlloc2FromApp if possible as it is available to Windows store apps pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp"); - if (pVirtualAlloc2 == NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2"); + if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2"); FreeLibrary(hDll); } hDll = LoadLibrary(TEXT("ntdll.dll")); @@ -170,7 +170,7 @@ void _mi_os_init() { os_alloc_granularity = os_page_size; } if (mi_option_is_enabled(mi_option_large_os_pages)) { - large_os_page_size = 2 * MiB; + large_os_page_size = 2*MiB; } } #endif @@ -210,7 +210,7 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment #if (MI_INTPTR_SIZE >= 8) // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations void* hint; - if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment, size)) != NULL) { + if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) { return VirtualAlloc(hint, size, flags, PAGE_READWRITE); } #endif @@ -233,7 +233,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; void* p = NULL; 
if ((large_only || use_large_os_page(size, try_alignment)) - && allow_large && (flags & MEM_COMMIT) != 0 && (flags & MEM_RESERVE) != 0) { + && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); if (!large_only && try_ok > 0) { // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. @@ -247,12 +247,12 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, if (large_only) return p; // fall back to non-large page allocation on error (`p == NULL`). if (p == NULL) { - mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations + mi_atomic_write(&large_page_try_ok,10); // on error, don't try again for the next N allocations } } } if (p == NULL) { - *is_large = ((flags & MEM_LARGE_PAGES) != 0); + *is_large = ((flags&MEM_LARGE_PAGES) != 0); p = mi_win_virtual_allocx(addr, size, try_alignment, flags); } if (p == NULL) { @@ -264,8 +264,8 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, #elif defined(__wasi__) static void* mi_wasm_heap_grow(size_t size, size_t try_alignment) { uintptr_t base = __builtin_wasm_memory_size(0) * _mi_os_page_size(); - uintptr_t aligned_base = _mi_align_up(base, (uintptr_t)try_alignment); - size_t alloc_size = _mi_align_up(aligned_base - base + size, _mi_os_page_size()); + uintptr_t aligned_base = _mi_align_up(base, (uintptr_t) try_alignment); + size_t alloc_size = _mi_align_up( aligned_base - base + size, _mi_os_page_size()); mi_assert(alloc_size >= size && (alloc_size % _mi_os_page_size()) == 0); if (alloc_size < size) return NULL; if (__builtin_wasm_memory_grow(0, alloc_size / _mi_os_page_size()) == SIZE_MAX) { @@ -278,50 +278,50 @@ static void* mi_wasm_heap_grow(size_t size, size_t try_alignment) { #define MI_OS_USE_MMAP static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { void* p = NULL; -#if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) + #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) // on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations void* hint; if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment, size)) != NULL) { - p = mmap(hint, size, protect_flags, flags, fd, 0); - if (p == MAP_FAILED) p = NULL; // fall back to regular mmap + p = mmap(hint,size,protect_flags,flags,fd,0); + if (p==MAP_FAILED) p = NULL; // fall back to regular mmap } -#else + #else UNUSED(try_alignment); -#endif - if (p == NULL) { - p = mmap(addr, size, protect_flags, flags, fd, 0); - if (p == MAP_FAILED) p = NULL; + #endif + if (p==NULL) { + p = mmap(addr,size,protect_flags,flags,fd,0); + if (p==MAP_FAILED) p = NULL; } return p; } static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) { void* p = NULL; -#if !defined(MAP_ANONYMOUS) -#define MAP_ANONYMOUS MAP_ANON -#endif -#if !defined(MAP_NORESERVE) -#define MAP_NORESERVE 0 -#endif + #if !defined(MAP_ANONYMOUS) + #define MAP_ANONYMOUS MAP_ANON + #endif + #if !defined(MAP_NORESERVE) + #define MAP_NORESERVE 0 + #endif int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; int fd = -1; -#if defined(MAP_ALIGNED) // BSD + #if defined(MAP_ALIGNED) // BSD if (try_alignment > 0) { size_t n = _mi_bsr(try_alignment); if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB flags |= 
MAP_ALIGNED(n); } } -#endif -#if defined(PROT_MAX) + #endif + #if defined(PROT_MAX) protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD -#endif -#if defined(VM_MAKE_TAG) -// macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99) + #endif + #if defined(VM_MAKE_TAG) + // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99) int os_tag = (int)mi_option_get(mi_option_os_tag); if (os_tag < 100 || os_tag > 255) os_tag = 100; fd = VM_MAKE_TAG(os_tag); -#endif + #endif if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) { static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); @@ -335,39 +335,39 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro else { int lflags = flags; int lfd = fd; -#ifdef MAP_ALIGNED_SUPER + #ifdef MAP_ALIGNED_SUPER lflags |= MAP_ALIGNED_SUPER; -#endif -#ifdef MAP_HUGETLB + #endif + #ifdef MAP_HUGETLB lflags |= MAP_HUGETLB; -#endif -#ifdef MAP_HUGE_1GB + #endif + #ifdef MAP_HUGE_1GB static bool mi_huge_pages_available = true; if ((size % GiB) == 0 && mi_huge_pages_available) { lflags |= MAP_HUGE_1GB; } else -#endif + #endif { -#ifdef MAP_HUGE_2MB + #ifdef MAP_HUGE_2MB lflags |= MAP_HUGE_2MB; -#endif + #endif } -#ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB + #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; -#endif + #endif if (large_only || lflags != flags) { // try large OS page allocation *is_large = true; p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); -#ifdef MAP_HUGE_1GB + #ifdef MAP_HUGE_1GB if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { mi_huge_pages_available = false; // don't try huge 1GiB pages again _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno); lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); } -#endif + #endif if (large_only) return p; if (p == NULL) { mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations @@ -378,7 +378,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro if (p == NULL) { *is_large = false; p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); -#if defined(MADV_HUGEPAGE) + #if defined(MADV_HUGEPAGE) // Many Linux systems don't allow MAP_HUGETLB but they support instead // transparent huge pages (THP). 
It is not required to call `madvise` with MADV_HUGE // though since properly aligned allocations will already use large pages if available @@ -390,7 +390,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro *is_large = true; // possibly }; } -#endif + #endif } return p; } @@ -404,18 +404,18 @@ static volatile _Atomic(intptr_t) aligned_base; // Return a 4MiB aligned address that is probably available static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { if (try_alignment == 0 || try_alignment > MI_SEGMENT_SIZE) return NULL; - if ((size % MI_SEGMENT_SIZE) != 0) return NULL; + if ((size%MI_SEGMENT_SIZE) != 0) return NULL; intptr_t hint = mi_atomic_add(&aligned_base, size); - if (hint == 0 || hint > ((intptr_t)30 << 40)) { // try to wrap around after 30TiB (area after 32TiB is used for huge OS pages) + if (hint == 0 || hint > ((intptr_t)30<<40)) { // try to wrap around after 30TiB (area after 32TiB is used for huge OS pages) intptr_t init = ((intptr_t)4 << 40); // start at 4TiB area -#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode uintptr_t r = _mi_random_init((uintptr_t)&mi_os_get_aligned_hint ^ hint); - init = init + (MI_SEGMENT_SIZE * ((r >> 17) & 0xFFFF)); // (randomly 0-64k)*4MiB == 0 to 256GiB -#endif + init = init + (MI_SEGMENT_SIZE * ((r>>17) & 0xFFFF)); // (randomly 0-64k)*4MiB == 0 to 256GiB + #endif mi_atomic_cas_strong(mi_atomic_cast(uintptr_t, &aligned_base), init, hint + size); hint = mi_atomic_add(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all } - if (hint % try_alignment != 0) return NULL; + if (hint%try_alignment != 0) return NULL; return (void*)hint; } #else @@ -444,17 +444,17 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo } */ -#if defined(_WIN32) - int flags = MEM_RESERVE; - if (commit) flags |= MEM_COMMIT; - p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); -#elif defined(__wasi__) - *is_large = false; - p = mi_wasm_heap_grow(size, try_alignment); -#else - int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); - p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); -#endif + #if defined(_WIN32) + int flags = MEM_RESERVE; + if (commit) flags |= MEM_COMMIT; + p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); + #elif defined(__wasi__) + *is_large = false; + p = mi_wasm_heap_grow(size, try_alignment); + #else + int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); + p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); + #endif mi_stat_counter_increase(stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); @@ -564,7 +564,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* lar allow_large = *large; *large = false; } - return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large != NULL ? 
large : &allow_large), tld->stats); + return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), tld->stats); } @@ -616,7 +616,7 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ _mi_stat_decrease(&stats->committed, csize); } -#if defined(_WIN32) + #if defined(_WIN32) if (commit) { // if the memory was already committed, the call succeeds but it is not zero'd // *is_zero = true; @@ -627,9 +627,9 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT); err = (ok ? 0 : GetLastError()); } -#elif defined(__wasi__) + #elif defined(__wasi__) // WebAssembly guests can't control memory protection -#elif defined(MAP_FIXED) + #elif defined(MAP_FIXED) if (!commit) { // use mmap with MAP_FIXED to discard the existing memory (and reduce commit charge) void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), -1, 0); @@ -640,10 +640,10 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ err = mprotect(start, csize, (PROT_READ | PROT_WRITE)); if (err != 0) { err = errno; } } -#else + #else err = mprotect(start, csize, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE)); if (err != 0) { err = errno; } -#endif + #endif if (err != 0) { _mi_warning_message("%s error: start: 0x%p, csize: 0x%x, err: %i\n", commit ? "commit" : "decommit", start, csize, err); } @@ -674,24 +674,24 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) if (reset) _mi_stat_increase(&stats->reset, csize); - else _mi_stat_decrease(&stats->reset, csize); + else _mi_stat_decrease(&stats->reset, csize); if (!reset) return true; // nothing to do on unreset! -#if (MI_DEBUG>1) - if (MI_SECURE == 0) { + #if (MI_DEBUG>1) + if (MI_SECURE==0) { memset(start, 0, csize); // pretend it is eagerly reset } -#endif + #endif #if defined(_WIN32) // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE); mi_assert_internal(p == start); -#if 1 + #if 1 if (p == start && start != NULL) { - VirtualUnlock(start, csize); // VirtualUnlock after MEM_RESET removes the memory from the working set + VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set } -#endif + #endif if (p != start) return false; #else #if defined(MADV_FREE) @@ -748,7 +748,7 @@ static bool mi_os_protectx(void* addr, size_t size, bool protect) { if (csize == 0) return false; /* if (_mi_os_is_huge_reserved(addr)) { - _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n"); + _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n"); } */ int err = 0; @@ -780,7 +780,7 @@ bool _mi_os_unprotect(void* addr, size_t size) { bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) { // page align conservatively within the range - mi_assert_internal(oldsize > newsize&& p != NULL); + mi_assert_internal(oldsize > newsize && p != NULL); if (oldsize < newsize || p == NULL) return false; if (oldsize == newsize) return true; @@ -808,20 +808,20 @@ and possibly associated with a specific NUMA node. 
(use `numa_node>=0`) #if defined(WIN32) && (MI_INTPTR_SIZE >= 8) static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { - mi_assert_internal(size % GiB == 0); + mi_assert_internal(size%GiB == 0); mi_assert_internal(addr != NULL); const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; mi_win_enable_large_os_pages(); -#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) + #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} }; // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages static bool mi_huge_pages_available = true; if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { -#ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE -#define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) -#endif + #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE + #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) + #endif params[0].Type = 5; // == MemExtendedParameterAttributeFlags; params[0].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; ULONG param_count = 1; @@ -848,7 +848,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) params[0].ULong = (unsigned)numa_node; return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); } -#endif + #endif // otherwise use regular virtual alloc on older windows return VirtualAlloc(addr, size, flags, PAGE_READWRITE); } @@ -869,16 +869,16 @@ static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, cons } #endif static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { - mi_assert_internal(size % GiB == 0); + mi_assert_internal(size%GiB == 0); bool is_large = true; void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); if (p == NULL) return NULL; - if (numa_node >= 0 && numa_node < 8 * MI_INTPTR_SIZE) { // at most 64 nodes + if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes uintptr_t numa_mask = (1UL << numa_node); // TODO: does `mbind` work correctly for huge OS pages? should we // use `set_mempolicy` before calling mmap instead? 
// see: - long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8 * MI_INTPTR_SIZE, 0); + long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); if (err != 0) { _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno)); } @@ -910,7 +910,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { start = ((uintptr_t)32 << 40); // 32TiB virtual start address #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode uintptr_t r = _mi_random_init((uintptr_t)&mi_os_claim_huge_pages); - start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r >> 17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB + start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB #endif } end = start + size; @@ -963,8 +963,8 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse if (max_msecs > 0) { mi_msecs_t elapsed = _mi_clock_end(start_t); if (page >= 1) { - mi_msecs_t estimate = ((elapsed / (page + 1)) * pages); - if (estimate > 2 * max_msecs) { // seems like we are going to timeout, break + mi_msecs_t estimate = ((elapsed / (page+1)) * pages); + if (estimate > 2*max_msecs) { // seems like we are going to timeout, break elapsed = max_msecs + 1; } } @@ -974,7 +974,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse } } } - mi_assert_internal(page * MI_HUGE_OS_PAGE_SIZE <= size); + mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size); if (pages_reserved != NULL) *pages_reserved = page; if (psize != NULL) *psize = page * MI_HUGE_OS_PAGE_SIZE; return (page == 0 ? NULL : start); @@ -983,7 +983,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // free every huge page in a range individually (as we allocated per page) // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) { - if (p == NULL || size == 0) return; + if (p==NULL || size==0) return; uint8_t* base = (uint8_t*)p; while (size >= MI_HUGE_OS_PAGE_SIZE) { _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats); @@ -999,7 +999,7 @@ static size_t mi_os_numa_nodex() { PROCESSOR_NUMBER pnum; USHORT numa_node = 0; GetCurrentProcessorNumberEx(&pnum); - GetNumaProcessorNodeEx(&pnum, &numa_node); + GetNumaProcessorNodeEx(&pnum,&numa_node); return numa_node; } @@ -1026,12 +1026,12 @@ static size_t mi_os_numa_nodex(void) { static size_t mi_os_numa_node_countx(void) { char buf[128]; unsigned node = 0; - for (node = 0; node < 256; node++) { + for(node = 0; node < 256; node++) { // enumerate node entries -- todo: it there a more efficient way to do this? 
(but ensure there is no allocation) snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); - if (access(buf, R_OK) != 0) break; + if (access(buf,R_OK) != 0) break; } - return (node + 1); + return (node+1); } #else static size_t mi_os_numa_nodex(void) { @@ -1058,7 +1058,7 @@ size_t _mi_os_numa_node_count_get(void) { int _mi_os_numa_node_get(mi_os_tld_t* tld) { UNUSED(tld); size_t numa_count = _mi_os_numa_node_count(); - if (numa_count <= 1) return 0; // optimize on single numa node systems: always node 0 + if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 size_t numa_node = mi_os_numa_nodex(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } diff --git a/src/page.c b/src/page.c index 9085ccb5..df6ecc71 100644 --- a/src/page.c +++ b/src/page.c @@ -75,7 +75,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_segment_t* segment = _mi_page_segment(page); uint8_t* start = _mi_page_start(segment,page,NULL); - mi_assert_internal(start == _mi_segment_page_start(segment,page,page->block_size,NULL)); + mi_assert_internal(start == _mi_segment_page_start(segment,page,page->block_size,NULL,NULL)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -229,6 +229,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_internal(page->heap == NULL); mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); + mi_assert_internal(!page->is_reset); _mi_page_free_collect(page,false); mi_page_queue_t* pq = mi_page_queue(heap, page->block_size); mi_page_queue_push(heap, pq, page); @@ -342,7 +343,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(pq == mi_page_queue_of(page)); mi_assert_internal(page->heap != NULL); - + #if MI_DEBUG > 1 mi_heap_t* pheap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap)); #endif @@ -597,7 +598,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(block_size > 0); // set fields size_t page_size; - _mi_segment_page_start(segment, page, block_size, &page_size); + _mi_segment_page_start(segment, page, block_size, &page_size, NULL); page->block_size = block_size; mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); diff --git a/src/segment.c b/src/segment.c index 549dd339..ffba8c0d 100644 --- a/src/segment.c +++ b/src/segment.c @@ -13,6 +13,8 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_PAGE_HUGE_ALIGN (256*1024) +static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); + /* ----------------------------------------------------------- Segment allocation We allocate pages inside big OS allocated "segments" @@ -40,7 +42,6 @@ terms of the MIT license. 
A copy of the license can be found in the file Queue of segments containing free pages ----------------------------------------------------------- */ - #if (MI_DEBUG>=3) static bool mi_segment_queue_contains(const mi_segment_queue_t* queue, mi_segment_t* segment) { mi_assert_internal(segment != NULL); @@ -143,31 +144,50 @@ static bool mi_segment_is_valid(mi_segment_t* segment) { } #endif + +/* ----------------------------------------------------------- + Page reset +----------------------------------------------------------- */ + +static void mi_page_reset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld) { + if (!mi_option_is_enabled(mi_option_page_reset)) return; + if (segment->mem_is_fixed || page->segment_in_use || page->is_reset) return; + size_t psize; + void* start = mi_segment_raw_page_start(segment, page, &psize); + page->is_reset = true; + mi_assert_internal(size <= psize); + _mi_mem_reset(start, ((size == 0 || size > psize) ? psize : size), tld->os); +} + +static void mi_page_unreset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld) +{ + mi_assert_internal(page->is_reset); + mi_assert_internal(!segment->mem_is_fixed); + page->is_reset = false; + size_t psize; + uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); + bool is_zero = false; + _mi_mem_unreset(start, ((size == 0 || size > psize) ? psize : size), &is_zero, tld->os); + if (is_zero) page->is_zero_init = true; +} + + /* ----------------------------------------------------------- Segment size calculations ----------------------------------------------------------- */ -// Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size) -{ +// Raw start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) +// The raw start is not taking aligned block allocation into consideration. +static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { size_t psize = (segment->page_kind == MI_PAGE_HUGE ? 
segment->segment_size : (size_t)1 << segment->page_shift); - uint8_t* p = (uint8_t*)segment + page->segment_idx*psize; + uint8_t* p = (uint8_t*)segment + page->segment_idx * psize; if (page->segment_idx == 0) { // the first page starts after the segment info (and possible guard page) - p += segment->segment_info_size; + p += segment->segment_info_size; psize -= segment->segment_info_size; - // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) - if (block_size > 0 && segment->page_kind <= MI_PAGE_MEDIUM) { - size_t adjust = block_size - ((uintptr_t)p % block_size); - if (adjust < block_size) { - p += adjust; - psize -= adjust; - } - mi_assert_internal((uintptr_t)p % block_size == 0); - } } - + if (MI_SECURE > 1 || (MI_SECURE == 1 && page->segment_idx == segment->capacity - 1)) { // secure == 1: the last page has an os guard page at the end // secure > 1: every page has an os guard page @@ -175,19 +195,36 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* pa } if (page_size != NULL) *page_size = psize; - mi_assert_internal(_mi_ptr_page(p) == page); + mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page); mi_assert_internal(_mi_ptr_segment(p) == segment); return p; } -static size_t mi_segment_size(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) { - /* - if (mi_option_is_enabled(mi_option_secure)) { - // always reserve maximally so the protection falls on - // the same address area, as we need to reuse them from the caches interchangably. - capacity = MI_SMALL_PAGES_PER_SEGMENT; +// Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size) +{ + size_t psize; + uint8_t* p = mi_segment_raw_page_start(segment, page, &psize); + if (pre_size != NULL) *pre_size = 0; + if (page->segment_idx == 0 && block_size > 0 && segment->page_kind <= MI_PAGE_MEDIUM) { + // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) + size_t adjust = block_size - ((uintptr_t)p % block_size); + if (adjust < block_size) { + p += adjust; + psize -= adjust; + if (pre_size != NULL) *pre_size = adjust; + } + mi_assert_internal((uintptr_t)p % block_size == 0); } - */ + + if (page_size != NULL) *page_size = psize; + mi_assert_internal(page->block_size==0 || _mi_ptr_page(p) == page); + mi_assert_internal(_mi_ptr_segment(p) == segment); + return p; +} + +static size_t mi_segment_size(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) +{ const size_t minsize = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */; size_t guardsize = 0; size_t isize = 0; @@ -234,7 +271,15 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se mi_assert_internal(!segment->mem_is_fixed); _mi_mem_unprotect(segment, segment->segment_size); // ensure no more guard pages are set } - _mi_mem_free(segment, segment_size, segment->memid, tld->os); + + bool fully_committed = true; + bool any_reset = false; + for (size_t i = 0; i < segment->capacity; i++) { + const mi_page_t* page = &segment->pages[i]; + if (!page->is_committed) fully_committed = false; + if (page->is_reset) any_reset = true; + } + _mi_mem_free(segment, segment_size, segment->memid, fully_committed, any_reset, tld->os); } @@ -275,7 +320,7 @@ static bool 
mi_segment_cache_full(mi_segments_tld_t* tld) static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(!mi_segment_is_in_free_queue(segment, tld)); - mi_assert_internal(segment->next == NULL); + mi_assert_internal(segment->next == NULL); if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) { return false; } @@ -328,31 +373,31 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); bool eager = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit); bool commit = eager || (page_kind >= MI_PAGE_LARGE); - bool protection_still_good = false; + bool pages_still_good = false; bool is_zero = false; // Try to get it from our thread local cache first - mi_segment_t* segment = mi_segment_cache_pop(segment_size, tld); + mi_segment_t* segment = NULL; // mi_segment_cache_pop(segment_size, tld); if (segment != NULL) { - if (MI_SECURE!=0) { - mi_assert_internal(!segment->mem_is_fixed); - if (segment->page_kind != page_kind) { + if (page_kind <= MI_PAGE_MEDIUM && segment->page_kind == page_kind && segment->segment_size == segment_size) { + pages_still_good = true; + } + else + { + // different page kinds; unreset any reset pages, and unprotect + // TODO: optimize cache pop to return fitting pages if possible? + for (size_t i = 0; i < segment->capacity; i++) { + mi_page_t* page = &segment->pages[i]; + if (page->is_reset) { + mi_page_unreset(segment, page, 0, tld); // todo: only unreset the part that was reset? (instead of the full page) + } + } + if (MI_SECURE!=0) { + mi_assert_internal(!segment->mem_is_fixed); + // TODO: should we unprotect per page? (with is_protected flag?) 
_mi_mem_unprotect(segment, segment->segment_size); // reset protection if the page kind differs } - else { - protection_still_good = true; // otherwise, the guard pages are still in place - } - } - if (!segment->mem_is_committed && page_kind > MI_PAGE_MEDIUM) { - mi_assert_internal(!segment->mem_is_fixed); - _mi_mem_commit(segment, segment->segment_size, &is_zero, tld->os); - segment->mem_is_committed = true; - } - if (!segment->mem_is_fixed && mi_option_is_enabled(mi_option_page_reset)) { - bool reset_zero = false; - _mi_mem_unreset(segment, segment->segment_size, &reset_zero, tld->os); - if (reset_zero) is_zero = true; - } + } } else { // Allocate the segment from the OS @@ -373,27 +418,42 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, } mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); - // zero the segment info (but not the `mem` fields) - ptrdiff_t ofs = offsetof(mi_segment_t,next); - memset((uint8_t*)segment + ofs, 0, info_size - ofs); - - // guard pages - if ((MI_SECURE != 0) && !protection_still_good) { - // in secure mode, we set up a protected page in between the segment info - // and the page data - mi_assert_internal( info_size == pre_size - _mi_os_page_size() && info_size % _mi_os_page_size() == 0); - _mi_mem_protect( (uint8_t*)segment + info_size, (pre_size - info_size) ); - size_t os_page_size = _mi_os_page_size(); - if (MI_SECURE <= 1) { - // and protect the last page too - _mi_mem_protect( (uint8_t*)segment + segment_size - os_page_size, os_page_size ); - } - else { - // protect every page - for (size_t i = 0; i < capacity; i++) { - _mi_mem_protect( (uint8_t*)segment + (i+1)*page_size - os_page_size, os_page_size ); + if (!pages_still_good) { + // guard pages + if (MI_SECURE != 0) { + // in secure mode, we set up a protected page in between the segment info + // and the page data + mi_assert_internal(info_size == pre_size - _mi_os_page_size() && info_size % _mi_os_page_size() == 0); + _mi_mem_protect((uint8_t*)segment + info_size, (pre_size - info_size)); + const size_t os_page_size = _mi_os_page_size(); + if (MI_SECURE <= 1) { + // and protect the last page too + _mi_mem_protect((uint8_t*)segment + segment_size - os_page_size, os_page_size); + } + else { + // protect every page + for (size_t i = 0; i < capacity; i++) { + _mi_mem_protect((uint8_t*)segment + (i+1)*page_size - os_page_size, os_page_size); + } } } + + // zero the segment info (but not the `mem` fields) + ptrdiff_t ofs = offsetof(mi_segment_t, next); + memset((uint8_t*)segment + ofs, 0, info_size - ofs); + + // initialize pages info + for (uint8_t i = 0; i < capacity; i++) { + segment->pages[i].segment_idx = i; + segment->pages[i].is_reset = false; + segment->pages[i].is_committed = commit; + segment->pages[i].is_zero_init = is_zero; + } + } + else { + // zero the segment info but not the pages info (and mem fields) + ptrdiff_t ofs = offsetof(mi_segment_t, next); + memset((uint8_t*)segment + ofs, 0, offsetof(mi_segment_t,pages) - ofs); } // initialize @@ -404,13 +464,8 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, segment->segment_info_size = pre_size; segment->thread_id = _mi_thread_id(); segment->cookie = _mi_ptr_cookie(segment); - for (uint8_t i = 0; i < segment->capacity; i++) { - segment->pages[i].segment_idx = i; - segment->pages[i].is_reset = false; - segment->pages[i].is_committed = commit; - segment->pages[i].is_zero_init = is_zero; - } _mi_stat_increase(&tld->stats->page_committed, 
segment->segment_info_size); + //fprintf(stderr,"mimalloc: alloc segment at %p\n", (void*)segment); return segment; } @@ -463,24 +518,22 @@ static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t* for (size_t i = 0; i < segment->capacity; i++) { mi_page_t* page = &segment->pages[i]; if (!page->segment_in_use) { - if (page->is_reset || !page->is_committed) { + // set in-use before doing unreset to prevent delayed reset + page->segment_in_use = true; + segment->used++; + if (!page->is_committed) { + mi_assert_internal(!segment->mem_is_fixed); + mi_assert_internal(!page->is_reset); size_t psize; - uint8_t* start = _mi_page_start(segment, page, &psize); - if (!page->is_committed) { - mi_assert_internal(!segment->mem_is_fixed); - page->is_committed = true; - bool is_zero = false; - _mi_mem_commit(start,psize,&is_zero,tld->os); - if (is_zero) page->is_zero_init = true; - } - if (page->is_reset) { - mi_assert_internal(!segment->mem_is_fixed); - page->is_reset = false; - bool is_zero = false; - _mi_mem_unreset(start, psize, &is_zero, tld->os); - if (is_zero) page->is_zero_init = true; - } + uint8_t* start = _mi_page_start(segment, page, &psize); + page->is_committed = true; + bool is_zero = false; + _mi_mem_commit(start,psize,&is_zero,tld->os); + if (is_zero) page->is_zero_init = true; } + if (page->is_reset) { + mi_page_unreset(segment, page, 0, tld); // todo: only unreset the part that was reset? + } return page; } } @@ -503,22 +556,21 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_seg _mi_stat_decrease(&tld->stats->page_committed, inuse); _mi_stat_decrease(&tld->stats->pages, 1); - // reset the page memory to reduce memory pressure? - if (!segment->mem_is_fixed && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) - // && segment->page_kind <= MI_PAGE_MEDIUM) // to prevent partial overlapping resets - { - size_t psize; - uint8_t* start = _mi_page_start(segment, page, &psize); - page->is_reset = true; - _mi_mem_reset(start, psize, tld->os); - } + // calculate the used size from the raw (non-aligned) start of the page + size_t pre_size; + _mi_segment_page_start(segment, page, page->block_size, NULL, &pre_size); + size_t used_size = pre_size + (page->capacity * page->block_size); - // zero the page data, but not the segment fields + // zero the page data, but not the segment fields page->is_zero_init = false; ptrdiff_t ofs = offsetof(mi_page_t,capacity); memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs); page->segment_in_use = false; segment->used--; + + // reset the page memory to reduce memory pressure? 
+ // note: must come after setting `segment_in_use` to false + mi_page_reset(segment, page, used_size, tld); } void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) @@ -568,7 +620,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { // remove the segment from the free page queue if needed mi_segment_remove_from_free_queue(segment,tld); mi_assert_internal(segment->next == NULL && segment->prev == NULL); - + // all pages in the segment are abandoned; add it to the abandoned list _mi_stat_increase(&tld->stats->segments_abandoned, 1); mi_segments_track_size(-((long)segment->segment_size), tld); @@ -628,6 +680,8 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen for (size_t i = 0; i < segment->capacity; i++) { mi_page_t* page = &segment->pages[i]; if (page->segment_in_use) { + mi_assert_internal(!page->is_reset); + mi_assert_internal(page->is_committed); segment->abandoned--; mi_assert(page->next == NULL); _mi_stat_decrease(&tld->stats->pages_abandoned, 1); @@ -636,7 +690,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen mi_segment_page_clear(segment,page,tld); } else { - // otherwise reclaim it + // otherwise reclaim it _mi_page_reclaim(heap,page); } } @@ -666,8 +720,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(mi_segment_has_free(segment)); mi_page_t* page = mi_segment_find_free(segment, tld); - page->segment_in_use = true; - segment->used++; + mi_assert_internal(page->segment_in_use); mi_assert_internal(segment->used <= segment->capacity); if (segment->used == segment->capacity) { // if no more free pages, remove from the queue @@ -685,7 +738,11 @@ static mi_page_t* mi_segment_page_alloc(mi_page_kind_t kind, size_t page_shift, mi_segment_enqueue(free_queue, segment); } mi_assert_internal(free_queue->first != NULL); - return mi_segment_page_alloc_in(free_queue->first,tld); + mi_page_t* page = mi_segment_page_alloc_in(free_queue->first,tld); +#if MI_DEBUG>=2 + _mi_segment_page_start(_mi_page_segment(page), page, sizeof(void*), NULL, NULL)[0] = 0; +#endif + return page; } static mi_page_t* mi_segment_small_page_alloc(mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { @@ -706,6 +763,9 @@ static mi_page_t* mi_segment_large_page_alloc(mi_segments_tld_t* tld, mi_os_tld_ segment->used = 1; mi_page_t* page = &segment->pages[0]; page->segment_in_use = true; +#if MI_DEBUG>=2 + _mi_segment_page_start(segment, page, sizeof(void*), NULL, NULL)[0] = 0; +#endif return page; } @@ -717,7 +777,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld segment->used = 1; segment->thread_id = 0; // huge pages are immediately abandoned mi_page_t* page = &segment->pages[0]; - page->segment_in_use = true; + page->segment_in_use = true; return page; } From 049dbf41bacbf8a839551cd3e7710ffa1925b770 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Wed, 20 Nov 2019 15:44:07 -0800 Subject: [PATCH 48/48] fix commit bits for huge page allocations --- src/memory.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/memory.c b/src/memory.c index 94b6348f..214bf0d3 100644 --- a/src/memory.c +++ b/src/memory.c @@ -181,6 +181,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit,
&region_large, &is_zero, &arena_memid, tld); if (start == NULL) return false; mi_assert_internal(!(region_large && !allow_large)); + mi_assert_internal(!region_large || region_commit); // claim a fresh slot const uintptr_t idx = mi_atomic_increment(&regions_count); @@ -194,8 +195,8 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t* r = &regions[idx]; r->arena_memid = arena_memid; mi_atomic_write(&r->in_use, 0); - mi_atomic_write(&r->dirty, (is_zero ? 0 : ~0UL)); - mi_atomic_write(&r->commit, (region_commit ? ~0UL : 0)); + mi_atomic_write(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL)); + mi_atomic_write(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0)); mi_atomic_write(&r->reset, 0); *bit_idx = 0; mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL); @@ -291,6 +292,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo bool any_uncommitted; mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted); if (any_uncommitted) { + mi_assert_internal(!info.is_large); bool commit_zero; _mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld); if (commit_zero) *is_zero = true; @@ -304,6 +306,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo // unreset reset blocks if (mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) { + mi_assert_internal(!info.is_large); mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit); mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx); bool reset_zero;
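
Note on the last hunk above (illustrative, not part of the patch): `~0UL` is replaced by `MI_BITMAP_FIELD_FULL` when a freshly claimed region is marked fully dirty or fully committed. Assuming the `dirty`/`commit` fields are `uintptr_t`-sized bitmaps (matching the `_Atomic(uintptr_t)` fields used elsewhere in this series) and that `MI_BITMAP_FIELD_FULL` is an all-ones value of that type defined in mimalloc's bitmap code, the difference matters on LLP64 targets such as 64-bit Windows, where `unsigned long` is only 32 bits wide and `~0UL` therefore fills only the low half of the field. The standalone C sketch below demonstrates only that width mismatch; it is not mimalloc source:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // `~0UL` widened to a 64-bit field: all ones on LP64 (Linux/macOS),
  // but only the low 32 bits on LLP64 (64-bit Windows).
  uintptr_t from_ulong = (uintptr_t)(~0UL);
  // A full mask written in the bitmap's own type is all ones on every platform.
  uintptr_t full_field = ~(uintptr_t)0;
  printf("(uintptr_t)(~0UL) = 0x%" PRIxPTR "\n", from_ulong);
  printf("~(uintptr_t)0     = 0x%" PRIxPTR "\n", full_field);
  return 0;
}

Under that assumption, the old initialization could leave the upper blocks of an always-committed huge-page region looking uncommitted on Windows, which would be consistent with the commit message "fix commit bits for huge page allocations".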