From 288726606390edb4ffb9664b9bce0271516b550d Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 6 Nov 2019 14:17:36 -0800 Subject: [PATCH 01/12] optimize get numa node for single node systems --- src/os.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/os.c b/src/os.c index 44ef9830..254f85f1 100644 --- a/src/os.c +++ b/src/os.c @@ -1046,9 +1046,10 @@ int _mi_os_numa_node_count(void) { int _mi_os_numa_node(mi_os_tld_t* tld) { UNUSED(tld); - int numa_node = mi_os_numa_nodex(); - // never more than the node count and >= 0 int numa_count = _mi_os_numa_node_count(); + if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 + // never more than the node count and >= 0 + int numa_node = mi_os_numa_nodex(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } if (numa_node < 0) numa_node = 0; return numa_node; From 00e19cad9abd225bb4c0975c4f9b6e440a81b97c Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 6 Nov 2019 21:37:23 -0800 Subject: [PATCH 02/12] refactor region code, split out atomic bitmap --- ide/vs2019/mimalloc-override.vcxproj | 2 +- ide/vs2019/mimalloc.vcxproj | 3 +- include/mimalloc-atomic.h | 31 ++- src/bitmap.inc.c | 160 +++++++++++++ src/memory.c | 339 ++++++++++----------------- 5 files changed, 318 insertions(+), 217 deletions(-) create mode 100644 src/bitmap.inc.c diff --git a/ide/vs2019/mimalloc-override.vcxproj b/ide/vs2019/mimalloc-override.vcxproj index 09fd37fb..e1c7535c 100644 --- a/ide/vs2019/mimalloc-override.vcxproj +++ b/ide/vs2019/mimalloc-override.vcxproj @@ -123,7 +123,7 @@ true true ../../include - MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false Default diff --git a/ide/vs2019/mimalloc.vcxproj b/ide/vs2019/mimalloc.vcxproj index 1fabff5e..19696c10 100644 --- a/ide/vs2019/mimalloc.vcxproj +++ b/ide/vs2019/mimalloc.vcxproj @@ -116,7 +116,7 @@ true true ../../include - MI_DEBUG=1;%(PreprocessorDefinitions); + MI_DEBUG=3;%(PreprocessorDefinitions); CompileAsCpp false stdcpp17 @@ -218,6 +218,7 @@ + diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index dff0f011..c18f990f 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -36,6 +36,13 @@ static inline void mi_atomic_add64(volatile int64_t* p, int64_t add); // Atomically add a value; returns the previous value. Memory ordering is relaxed. static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add); +// Atomically "and" a value; returns the previous value. Memory ordering is relaxed. +static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x); + +// Atomically "or" a value; returns the previous value. Memory ordering is relaxed. +static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x); + + // Atomically compare and exchange a value; returns `true` if successful. // May fail spuriously. Memory ordering as release on success, and relaxed on failure. 
// (Note: expected and desired are in opposite order from atomic_compare_exchange) @@ -121,22 +128,28 @@ static inline void* mi_atomic_exchange_ptr(volatile _Atomic(void*)* p, void* exc #include #ifdef _WIN64 typedef LONG64 msc_intptr_t; -#define RC64(f) f##64 +#define MI_64(f) f##64 #else typedef LONG msc_intptr_t; -#define RC64(f) f +#define MI_64(f) f #endif static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) { - return (intptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); + return (intptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); +} +static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x); +} +static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { - return (expected == (uintptr_t)RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected)); + return (expected == (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected)); } static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { return mi_atomic_cas_strong(p,desired,expected); } static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { - return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); + return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); } static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) { return *p; @@ -177,6 +190,14 @@ static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add MI_USING_STD return atomic_fetch_add_explicit(p, add, memory_order_relaxed); } +static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + MI_USING_STD + return atomic_fetch_and_explicit(p, x, memory_order_relaxed); +} +static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + MI_USING_STD + return atomic_fetch_or_explicit(p, x, memory_order_relaxed); +} static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { MI_USING_STD return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_release, memory_order_relaxed); diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c new file mode 100644 index 00000000..5bea4748 --- /dev/null +++ b/src/bitmap.inc.c @@ -0,0 +1,160 @@ +#pragma once +#ifndef MI_BITMAP_H +#define MI_BITMAP_H + +#include "mimalloc.h" +#include "mimalloc-internal.h" + +// Use bit scan forward to quickly find the first zero bit if it is available +#if defined(_MSC_VER) +#define MI_HAVE_BITSCAN +#include +static inline size_t mi_bsf(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + MI_64(_BitScanForward)(&idx, x); + return idx; +} +static inline size_t mi_bsr(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + MI_64(_BitScanReverse)(&idx, x); + return idx; +} +#elif defined(__GNUC__) || defined(__clang__) +#define MI_HAVE_BITSCAN +#if (INTPTR_MAX == LONG_MAX) +# define MI_L(x) x##l +#else +# define MI_L(x) x##ll 
+#endif +static inline size_t mi_bsf(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x)); +} +static inline size_t mi_bsr(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x)); +} +#endif + + +#define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE) +#define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set + +// An atomic bitmap of `uintptr_t` fields +typedef volatile _Atomic(uintptr_t) mi_bitmap_field_t; +typedef mi_bitmap_field_t* mi_bitmap_t; + +// A bitmap index is the index of the bit in a bitmap. +typedef size_t mi_bitmap_index_t; + +// Create a bit index. +static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { + mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); + return (idx*MI_BITMAP_FIELD_BITS) + bitidx; +} + +// Get the field index from a bit index. +static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { + return (bitmap_idx / MI_BITMAP_FIELD_BITS); +} + +// Get the bit index in a bitmap field +static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) { + return (bitmap_idx % MI_BITMAP_FIELD_BITS); +} + +// The bit mask for a given number of blocks at a specified bit index. +static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { + mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); + return ((((uintptr_t)1 << count) - 1) << bitidx); +} + +// Try to atomically claim a sequence of `count` bits in a single field at `idx` in `bitmap`. +// Returns `true` on success. +static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) +{ + mi_assert_internal(bitmap_idx != NULL); + volatile _Atomic(uintptr_t)* field = &bitmap[idx]; + uintptr_t map = mi_atomic_read(field); + if (map==MI_BITMAP_FIELD_FULL) return false; // short cut + + // search for 0-bit sequence of length count + const uintptr_t mask = mi_bitmap_mask_(count, 0); + const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; + +#ifdef MI_HAVE_BITSCAN + size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible +#else + size_t bitidx = 0; // otherwise start at 0 +#endif + uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx + + // scan linearly for a free range of zero bits + while (bitidx <= bitidx_max) { + if ((map & m) == 0) { // are the mask bits free at bitidx? + mi_assert_internal((m >> bitidx) == mask); // no overflow? + uintptr_t newmap = map | m; + mi_assert_internal((newmap^map) >> bitidx == mask); + if (!mi_atomic_cas_weak(field, newmap, map)) { // TODO: use strong cas here? + // no success, another thread claimed concurrently.. keep going + map = mi_atomic_read(field); + continue; + } + else { + // success, we claimed the bits! + *bitmap_idx = mi_bitmap_index_create(idx, bitidx); + return true; + } + } + else { + // on to the next bit range +#ifdef MI_HAVE_BITSCAN + size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); + mi_assert_internal(shift > 0 && shift <= count); +#else + size_t shift = 1; +#endif + bitidx += shift; + m <<= shift; + } + } + // no bits found + return false; +} + + +// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success. +// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields. 
+static inline bool mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t* bitmap_idx) { + for (size_t idx = 0; idx < bitmap_fields; idx++) { + if (mi_bitmap_try_claim_field(bitmap, idx, count, bitmap_idx)) { + return true; + } + } + return false; +} + +// Set `count` bits at `bitmap_idx` to 0 atomically +static inline void mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const uintptr_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); + mi_assert_internal((bitmap[idx] & mask) == mask); + mi_atomic_and(&bitmap[idx], ~mask); +} + + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously +static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const uintptr_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); + // mi_assert_internal((bitmap[idx] & mask) == 0); + uintptr_t prev = mi_atomic_or(&bitmap[idx], mask); + return ((prev & mask) == 0); +} + +#endif \ No newline at end of file diff --git a/src/memory.c b/src/memory.c index 75a1df92..29e0e412 100644 --- a/src/memory.c +++ b/src/memory.c @@ -37,6 +37,8 @@ Possible issues: #include // memset +#include "bitmap.inc.c" + // Internal raw OS interface size_t _mi_os_large_page_size(); bool _mi_os_protect(void* addr, size_t size); @@ -56,22 +58,22 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo // Constants #if (MI_INTPTR_SIZE==8) -#define MI_HEAP_REGION_MAX_SIZE (256 * (1ULL << 30)) // 256GiB => 16KiB for the region map +#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 16KiB for the region map #elif (MI_INTPTR_SIZE==4) -#define MI_HEAP_REGION_MAX_SIZE (3 * (1UL << 30)) // 3GiB => 196 bytes for the region map +#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // 196 bytes for the region map #else #error "define the maximum heap space allowed for regions on this platform" #endif #define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE -#define MI_REGION_MAP_BITS (MI_INTPTR_SIZE * 8) -#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_REGION_MAP_BITS) -#define MI_REGION_MAX_ALLOC_SIZE ((MI_REGION_MAP_BITS/4)*MI_SEGMENT_SIZE) // 64MiB -#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) -#define MI_REGION_MAP_FULL UINTPTR_MAX +#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB +#define MI_REGION_MAX_ALLOC_SIZE (MI_REGION_SIZE/4) // 64MiB +#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) +// Region info is a pointer to the memory region and two bits for +// its flags: is_large, and is_committed. typedef uintptr_t mi_region_info_t; static inline mi_region_info_t mi_region_info_create(void* start, bool is_large, bool is_committed) { @@ -88,19 +90,22 @@ static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, b // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. 
typedef struct mem_region_s { - volatile _Atomic(uintptr_t) map; // in-use bit per MI_SEGMENT_SIZE block - volatile _Atomic(mi_region_info_t) info; // start of virtual memory area, and flags - volatile _Atomic(uintptr_t) dirty_mask; // bit per block if the contents are not zero'd + volatile _Atomic(mi_region_info_t) info; // start of the memory area (and flags) volatile _Atomic(uintptr_t) numa_node; // associated numa node + 1 (so 0 is no association) - size_t arena_memid; // if allocated from a (huge page) arena + size_t arena_memid; // if allocated from a (huge page) arena } mem_region_t; - // The region map; 16KiB for a 256GiB HEAP_REGION_MAX -// TODO: in the future, maintain a map per NUMA node for numa aware allocation static mem_region_t regions[MI_REGION_MAX]; -static volatile _Atomic(uintptr_t) regions_count; // = 0; // allocated regions +// A bit mask per region for its claimed MI_SEGMENT_SIZE blocks. +static mi_bitmap_field_t regions_map[MI_REGION_MAX]; + +// A bit mask per region to track which blocks are dirty (= potentially written to) +static mi_bitmap_field_t regions_dirty[MI_REGION_MAX]; + +// Allocated regions +static volatile _Atomic(uintptr_t) regions_count; // = 0; /* ---------------------------------------------------------------------------- @@ -113,12 +118,6 @@ static size_t mi_region_block_count(size_t size) { return (size + MI_SEGMENT_SIZE - 1) / MI_SEGMENT_SIZE; } -// The bit mask for a given number of blocks at a specified bit index. -static uintptr_t mi_region_block_mask(size_t blocks, size_t bitidx) { - mi_assert_internal(blocks + bitidx <= MI_REGION_MAP_BITS); - return ((((uintptr_t)1 << blocks) - 1) << bitidx); -} - // Return a rounded commit/reset size such that we don't fragment large OS pages into small ones. static size_t mi_good_commit_size(size_t size) { if (size > (SIZE_MAX - _mi_os_large_page_size())) return size; @@ -137,8 +136,8 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { } -static size_t mi_memid_create(size_t idx, size_t bitidx) { - return ((idx*MI_REGION_MAP_BITS) + bitidx)<<1; +static size_t mi_memid_create(mi_bitmap_index_t bitmap_idx) { + return bitmap_idx<<1; } static size_t mi_memid_create_from_arena(size_t arena_memid) { @@ -149,78 +148,57 @@ static bool mi_memid_is_arena(size_t id) { return ((id&1)==1); } -static bool mi_memid_indices(size_t id, size_t* idx, size_t* bitidx, size_t* arena_memid) { +static bool mi_memid_indices(size_t id, mi_bitmap_index_t* bitmap_idx, size_t* arena_memid) { if (mi_memid_is_arena(id)) { *arena_memid = (id>>1); return true; } else { - *idx = ((id>>1) / MI_REGION_MAP_BITS); - *bitidx = ((id>>1) % MI_REGION_MAP_BITS); + *bitmap_idx = (mi_bitmap_index_t)(id>>1); return false; } } /* ---------------------------------------------------------------------------- -Commit from a region + Ensure a region is allocated from the OS (or an arena) -----------------------------------------------------------------------------*/ -// Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`. -// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. -// (not being able to claim is not considered an error so check for `p != NULL` afterwards). 
-static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, - size_t size, bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld) +static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_info_t* pinfo, mi_os_tld_t* tld) { - size_t mask = mi_region_block_mask(blocks,bitidx); - mi_assert_internal(mask != 0); - mi_assert_internal((mask & mi_atomic_read_relaxed(®ion->map)) == mask); - mi_assert_internal(®ions[idx] == region); - // ensure the region is reserved - mi_region_info_t info = mi_atomic_read(®ion->info); - if (info == 0) + mi_region_info_t info = mi_atomic_read(®ions[idx].info); + if (mi_unlikely(info == 0)) { bool region_commit = mi_option_is_enabled(mi_option_eager_region_commit); - bool region_large = *allow_large; + bool region_large = allow_large; + bool is_zero = false; size_t arena_memid = 0; - void* start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, is_zero, &arena_memid, tld); - /* - void* start = NULL; - if (region_large) { - start = _mi_os_try_alloc_from_huge_reserved(MI_REGION_SIZE, MI_SEGMENT_ALIGN); - if (start != NULL) { region_commit = true; } - } - if (start == NULL) { - start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, region_commit, ®ion_large, tld); - } - */ - mi_assert_internal(!(region_large && !*allow_large)); + void* start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); + mi_assert_internal(!(region_large && !allow_large)); if (start == NULL) { - // failure to allocate from the OS! unclaim the blocks and fail - size_t map; - do { - map = mi_atomic_read_relaxed(®ion->map); - } while (!mi_atomic_cas_weak(®ion->map, map & ~mask, map)); + // failure to allocate from the OS! fail + *pinfo = 0; return false; } // set the newly allocated region - info = mi_region_info_create(start,region_large,region_commit); - if (mi_atomic_cas_strong(®ion->info, info, 0)) { + info = mi_region_info_create(start, region_large, region_commit); + if (mi_atomic_cas_strong(®ions[idx].info, info, 0)) { // update the region count - region->arena_memid = arena_memid; - mi_atomic_write(®ion->numa_node, _mi_os_numa_node(tld) + 1); + regions[idx].arena_memid = arena_memid; + mi_atomic_write(®ions[idx].numa_node, _mi_os_numa_node(tld) + 1); + mi_atomic_write(®ions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0)); mi_atomic_increment(®ions_count); } else { // failed, another thread allocated just before us! // we assign it to a later slot instead (up to 4 tries). - for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { + for (size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { regions[idx+i].arena_memid = arena_memid; mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node(tld) + 1); + mi_atomic_write(®ions_dirty[idx], is_zero ? 
0 : ~((uintptr_t)0)); mi_atomic_increment(®ions_count); start = NULL; break; @@ -232,27 +210,33 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats); } // and continue with the memory at our index - info = mi_atomic_read(®ion->info); + info = mi_atomic_read(®ions[idx].info); } } - mi_assert_internal(info == mi_atomic_read(®ion->info)); + mi_assert_internal(info == mi_atomic_read(®ions[idx].info)); mi_assert_internal(info != 0); + *pinfo = info; + return true; +} + + +/* ---------------------------------------------------------------------------- + Commit blocks +-----------------------------------------------------------------------------*/ + +static void* mi_region_commit_blocks(mi_bitmap_index_t bitmap_idx, mi_region_info_t info, size_t blocks, size_t size, bool* commit, bool* is_large, bool* is_zero, mi_os_tld_t* tld) +{ + // set dirty bits + *is_zero = mi_bitmap_claim(regions_dirty, MI_REGION_MAX, blocks, bitmap_idx); // Commit the blocks to memory bool region_is_committed = false; bool region_is_large = false; - void* start = mi_region_info_read(info,®ion_is_large,®ion_is_committed); - mi_assert_internal(!(region_is_large && !*allow_large)); + void* start = mi_region_info_read(info, ®ion_is_large, ®ion_is_committed); + mi_assert_internal(!(region_is_large && !*is_large)); mi_assert_internal(start!=NULL); - // set dirty bits - uintptr_t m; - do { - m = mi_atomic_read(®ion->dirty_mask); - } while (!mi_atomic_cas_weak(®ion->dirty_mask, m | mask, m)); - *is_zero = ((m & mask) == 0); // no dirty bit set in our claimed range? - - void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); + void* blocks_start = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bitmap_idx) * MI_SEGMENT_SIZE); if (*commit && !region_is_committed) { // ensure commit bool commit_zero = false; @@ -266,99 +250,58 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // and return the allocation mi_assert_internal(blocks_start != NULL); - *allow_large = region_is_large; - *p = blocks_start; - *id = mi_memid_create(idx, bitidx); + *is_large = region_is_large; + return blocks_start; +} + +/* ---------------------------------------------------------------------------- + Claim and allocate blocks in a region +-----------------------------------------------------------------------------*/ + +static bool mi_region_alloc_blocks( + size_t idx, size_t blocks, size_t size, + bool* commit, bool* allow_large, bool* is_zero, + void** p, size_t* id, mi_os_tld_t* tld) +{ + mi_bitmap_index_t bitmap_idx; + if (!mi_bitmap_try_claim_field(regions_map, idx, blocks, &bitmap_idx)) { + return true; // no error, but also no success + } + mi_region_info_t info; + if (!mi_region_ensure_allocated(idx,*allow_large,&info,tld)) { + // failed to allocate region memory, unclaim the bits and fail + mi_bitmap_unclaim(regions_map, MI_REGION_MAX, blocks, bitmap_idx); + return false; + } + *p = mi_region_commit_blocks(bitmap_idx,info,blocks,size,commit,allow_large,is_zero,tld); + *id = mi_memid_create(bitmap_idx); return true; } -// Use bit scan forward to quickly find the first zero bit if it is available -#if defined(_MSC_VER) -#define MI_HAVE_BITSCAN -#include -static inline size_t mi_bsf(uintptr_t x) { - if (x==0) return 8*MI_INTPTR_SIZE; - DWORD idx; - #if (MI_INTPTR_SIZE==8) - _BitScanForward64(&idx, x); - #else - _BitScanForward(&idx, x); - #endif - return idx; -} -static inline size_t mi_bsr(uintptr_t x) { - if 
(x==0) return 8*MI_INTPTR_SIZE; - DWORD idx; - #if (MI_INTPTR_SIZE==8) - _BitScanReverse64(&idx, x); - #else - _BitScanReverse(&idx, x); - #endif - return idx; -} -#elif defined(__GNUC__) || defined(__clang__) -#define MI_HAVE_BITSCAN -static inline size_t mi_bsf(uintptr_t x) { - return (x==0 ? 8*MI_INTPTR_SIZE : __builtin_ctzl(x)); -} -static inline size_t mi_bsr(uintptr_t x) { - return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - __builtin_clzl(x)); -} -#endif -// Allocate `blocks` in a `region` at `idx` of a given `size`. -// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. -// (not being able to claim is not considered an error so check for `p != NULL` afterwards). -static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, - bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld) -{ - mi_assert_internal(p != NULL && id != NULL); - mi_assert_internal(blocks < MI_REGION_MAP_BITS); +/* ---------------------------------------------------------------------------- + Try to allocate blocks in suitable regions +-----------------------------------------------------------------------------*/ - const uintptr_t mask = mi_region_block_mask(blocks, 0); - const size_t bitidx_max = MI_REGION_MAP_BITS - blocks; - uintptr_t map = mi_atomic_read(®ion->map); - if (map==MI_REGION_MAP_FULL) return true; - - #ifdef MI_HAVE_BITSCAN - size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible - #else - size_t bitidx = 0; // otherwise start at 0 - #endif - uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx - - // scan linearly for a free range of zero bits - while(bitidx <= bitidx_max) { - if ((map & m) == 0) { // are the mask bits free at bitidx? - mi_assert_internal((m >> bitidx) == mask); // no overflow? - uintptr_t newmap = map | m; - mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_weak(®ion->map, newmap, map)) { // TODO: use strong cas here? - // no success, another thread claimed concurrently.. keep going - map = mi_atomic_read(®ion->map); - continue; - } - else { - // success, we claimed the bits - // now commit the block memory -- this can still fail - return mi_region_commit_blocks(region, idx, bitidx, blocks, - size, commit, allow_large, is_zero, p, id, tld); - } - } - else { - // on to the next bit range - #ifdef MI_HAVE_BITSCAN - size_t shift = (blocks == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); - mi_assert_internal(shift > 0 && shift <= blocks); - #else - size_t shift = 1; - #endif - bitidx += shift; - m <<= shift; - } +static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool allow_large ) { + uintptr_t m = mi_atomic_read_relaxed(®ions_map[idx]); + if (m == MI_BITMAP_FIELD_FULL) return false; + if (numa_node >= 0) { // use negative numa node to always succeed + int rnode = ((int)mi_atomic_read_relaxed(®ions->numa_node)) - 1; + if (rnode != numa_node) return false; + } + if (mi_unlikely(!(commit || allow_large))) { + // otherwise skip incompatible regions if possible. + // this is not guaranteed due to multiple threads allocating at the same time but + // that's ok. In secure mode, large is never allowed for any thread, so that works out; + // otherwise we might just not be able to reset/decommit individual pages sometimes. 
+ mi_region_info_t info = mi_atomic_read_relaxed(®ions->info); + bool is_large; + bool is_committed; + void* start = mi_region_info_read(info, &is_large, &is_committed); + bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation? + if (!ok) return false; } - // no error, but also no bits found return true; } @@ -366,33 +309,15 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written // if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. // (not being able to claim is not considered an error so check for `p != NULL` afterwards). -static bool mi_region_try_alloc_blocks(int numa_node, size_t idx, size_t blocks, size_t size, +static bool mi_region_try_alloc_blocks( + int numa_node, size_t idx, size_t blocks, size_t size, bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld) { // check if there are available blocks in the region.. mi_assert_internal(idx < MI_REGION_MAX); - mem_region_t* region = ®ions[idx]; - uintptr_t m = mi_atomic_read_relaxed(®ion->map); - int rnode = ((int)mi_atomic_read_relaxed(®ion->numa_node)) - 1; - if ((rnode < 0 || rnode == numa_node) && // fits current numa node - (m != MI_REGION_MAP_FULL)) // and some bits are zero - { - bool ok = (*commit || *allow_large); // committing or allow-large is always ok - if (!ok) { - // otherwise skip incompatible regions if possible. - // this is not guaranteed due to multiple threads allocating at the same time but - // that's ok. In secure mode, large is never allowed for any thread, so that works out; - // otherwise we might just not be able to reset/decommit individual pages sometimes. - mi_region_info_t info = mi_atomic_read_relaxed(®ion->info); - bool is_large; - bool is_committed; - void* start = mi_region_info_read(info,&is_large,&is_committed); - ok = (start == NULL || (*commit || !is_committed) || (*allow_large || !is_large)); // Todo: test with one bitmap operation? - } - if (ok) { - return mi_region_alloc_blocks(region, idx, blocks, size, commit, allow_large, is_zero, p, id, tld); - } + if (mi_region_is_suitable(numa_node, idx, *commit, *allow_large)) { + return mi_region_alloc_blocks(idx, blocks, size, commit, allow_large, is_zero, p, id, tld); } return true; // no error, but no success either } @@ -426,14 +351,14 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l size = _mi_align_up(size, _mi_os_page_size()); // calculate the number of needed blocks - size_t blocks = mi_region_block_count(size); + const size_t blocks = mi_region_block_count(size); mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE); // find a range of free blocks - int numa_node = _mi_os_numa_node(tld); + const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld)); void* p = NULL; - size_t count = mi_atomic_read(®ions_count); - size_t idx = tld->region_idx; // start at 0 to reuse low addresses? Or, use tld->region_idx to reduce contention? + const size_t count = mi_atomic_read(®ions_count); + size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? 
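+ // note: starting the search at the per-thread `region_idx` (rather than always at 0)
+ // spreads threads over different regions and reduces contention on the region bitmaps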
for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around if (!mi_region_try_alloc_blocks(numa_node, idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error @@ -456,7 +381,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l *id = mi_memid_create_from_arena(arena_memid); } else { - tld->region_idx = idx; // next start of search? currently not used as we use first-fit + tld->region_idx = idx; // next start of search } mi_assert_internal( p == NULL || (uintptr_t)p % alignment == 0); @@ -475,9 +400,8 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { if (p==NULL) return; if (size==0) return; size_t arena_memid = 0; - size_t idx = 0; - size_t bitidx = 0; - if (mi_memid_indices(id,&idx,&bitidx,&arena_memid)) { + mi_bitmap_index_t bitmap_idx; + if (mi_memid_indices(id,&bitmap_idx,&arena_memid)) { // was a direct arena allocation, pass through _mi_arena_free(p, size, arena_memid, stats); } @@ -487,11 +411,11 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { // we can align the size up to page size (as we allocate that way too) // this ensures we fully commit/decommit/reset size = _mi_align_up(size, _mi_os_page_size()); - size_t blocks = mi_region_block_count(size); - size_t mask = mi_region_block_mask(blocks, bitidx); + const size_t blocks = mi_region_block_count(size); + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`? mem_region_t* region = ®ions[idx]; - mi_assert_internal((mi_atomic_read_relaxed(®ion->map) & mask) == mask ); // claimed? mi_region_info_t info = mi_atomic_read(®ion->info); bool is_large; bool is_eager_committed; @@ -499,8 +423,8 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { mi_assert_internal(start != NULL); void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); mi_assert_internal(blocks_start == p); // not a pointer in our area? - mi_assert_internal(bitidx + blocks <= MI_REGION_MAP_BITS); - if (blocks_start != p || bitidx + blocks > MI_REGION_MAP_BITS) return; // or `abort`? + mi_assert_internal(bitidx + blocks <= MI_BITMAP_FIELD_BITS); + if (blocks_start != p || bitidx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`? // decommit (or reset) the blocks to reduce the working set. // TODO: implement delayed decommit/reset as these calls are too expensive @@ -526,12 +450,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { // this frees up virtual address space which might be useful on 32-bit systems? // and unclaim - uintptr_t map; - uintptr_t newmap; - do { - map = mi_atomic_read_relaxed(®ion->map); - newmap = map & ~mask; - } while (!mi_atomic_cas_weak(®ion->map, newmap, map)); + mi_bitmap_unclaim(regions_map, MI_REGION_MAX, blocks, bitmap_idx); } } @@ -542,23 +461,23 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { void _mi_mem_collect(mi_stats_t* stats) { // free every region that has no segments in use. 
for (size_t i = 0; i < regions_count; i++) { - mem_region_t* region = ®ions[i]; - if (mi_atomic_read_relaxed(®ion->map) == 0) { + if (mi_atomic_read_relaxed(®ions_map[i]) == 0) { // if no segments used, try to claim the whole region uintptr_t m; do { - m = mi_atomic_read_relaxed(®ion->map); - } while(m == 0 && !mi_atomic_cas_weak(®ion->map, ~((uintptr_t)0), 0 )); + m = mi_atomic_read_relaxed(®ions_map[i]); + } while(m == 0 && !mi_atomic_cas_weak(®ions_map[i], MI_BITMAP_FIELD_FULL, 0 )); if (m == 0) { // on success, free the whole region bool is_eager_committed; - void* start = mi_region_info_read(mi_atomic_read(®ion->info), NULL, &is_eager_committed); + void* start = mi_region_info_read(mi_atomic_read(®ions[i].info), NULL, &is_eager_committed); if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { - _mi_arena_free(start, MI_REGION_SIZE, region->arena_memid, stats); + _mi_arena_free(start, MI_REGION_SIZE, regions[i].arena_memid, stats); } // and release - mi_atomic_write(®ion->info,0); - mi_atomic_write(®ion->map,0); + mi_atomic_write(®ions[i].info,0); + mi_atomic_write(®ions_dirty[i],0); + mi_atomic_write(®ions_map[i],0); } } } From b09282bc0d6e3228c556eac833331438dbe774be Mon Sep 17 00:00:00 2001 From: daan Date: Wed, 6 Nov 2019 22:49:01 -0800 Subject: [PATCH 03/12] change arena allocator to atomic bitmap as well --- include/mimalloc.h | 4 +- src/arena.c | 268 +++++++++++++-------------------------------- src/bitmap.inc.c | 6 +- src/init.c | 4 +- src/os.c | 20 ++-- 5 files changed, 94 insertions(+), 208 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index c03ddc1e..70b6e412 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -230,8 +230,8 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; mi_decl_export bool mi_is_redirected() mi_attr_noexcept; -mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept; -mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t timeout_msecs) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; // deprecated mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; diff --git a/src/arena.c b/src/arena.c index e58d2c47..b807cd47 100644 --- a/src/arena.c +++ b/src/arena.c @@ -7,15 +7,19 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- "Arenas" are fixed area's of OS memory from which we can allocate -large blocks (>= MI_ARENA_BLOCK_SIZE, 16MiB). Currently only used to +large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB). Currently only used to allocate in one arena consisting of huge OS pages -- otherwise it delegates to direct allocation from the OS. In the future, we can expose an API to manually add more arenas which is sometimes needed for embedded devices or shared memory for example. -The arena allocation needs to be thread safe and we use a lock-free scan -with on-demand coalescing. +The arena allocation needs to be thread safe and we use an atomic +bitmap to allocate. 
The current implementation of the bitmap can +only do this within a field (`uintptr_t`) so we can allocate at most +blocks of 2GiB (64*32MiB) and no object can cross the boundary. This +can lead to fragmentation but fortunately most objects will be regions +of 256MiB in practice. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc-internal.h" @@ -23,6 +27,8 @@ with on-demand coalescing. #include // memset +#include "bitmap.inc.c" // atomic bitmap + // os.c void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); void _mi_os_free(void* p, size_t size, mi_stats_t* stats); @@ -36,9 +42,11 @@ int _mi_os_numa_node_count(void); Arena allocation ----------------------------------------------------------- */ -#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE -#define MI_ARENA_BLOCK_SIZE (4*MI_SEGMENT_ALIGN) // 16MiB -#define MI_MAX_ARENAS (64) +#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE +#define MI_ARENA_BLOCK_SIZE (8*MI_SEGMENT_ALIGN) // 32MiB +#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_FIELD_BITS * MI_ARENA_BLOCK_SIZE) // 2GiB +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 16MiB +#define MI_MAX_ARENAS (64) // not more than 256 (since we use 8 bits in the memid) // Block info: bit 0 contains the `in_use` bit, the upper bits the // size in count of arena blocks. @@ -48,11 +56,13 @@ typedef uintptr_t mi_block_info_t; typedef struct mi_arena_s { uint8_t* start; // the start of the memory area size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t field_count; // number of bitmap fields int numa_node; // associated NUMA node bool is_zero_init; // is the arena zero initialized? bool is_large; // large OS page allocated - _Atomic(uintptr_t) block_bottom; // optimization to start the search for free blocks - _Atomic(mi_block_info_t) blocks[1]; // `block_count` block info's + volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks } mi_arena_t; @@ -69,180 +79,55 @@ static _Atomic(uintptr_t) mi_arena_count; // = 0 // Use `0` as a special id for direct OS allocated memory. 
#define MI_MEMID_OS 0 -static size_t mi_memid_create(size_t arena_index, size_t block_index) { +static size_t mi_memid_create(size_t arena_index, mi_bitmap_index_t bitmap_index) { mi_assert_internal(arena_index < 0xFE); - return ((block_index << 8) | ((arena_index+1) & 0xFF)); + return ((bitmap_index << 8) | ((arena_index+1) & 0xFF)); } -static void mi_memid_indices(size_t memid, size_t* arena_index, size_t* block_index) { +static void mi_memid_indices(size_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { mi_assert_internal(memid != MI_MEMID_OS); *arena_index = (memid & 0xFF) - 1; - *block_index = (memid >> 8); + *bitmap_index = (memid >> 8); } -/* ----------------------------------------------------------- - Block info ------------------------------------------------------------ */ -static bool mi_block_is_in_use(mi_block_info_t info) { - return ((info&1) != 0); +static size_t mi_arena_block_count_of_size(size_t size) { + const size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); + const size_t bcount = asize / MI_ARENA_BLOCK_SIZE; + return bcount; } -static size_t mi_block_count(mi_block_info_t info) { - return (info>>1); -} - -static mi_block_info_t mi_block_info_create(size_t bcount, bool in_use) { - return (((mi_block_info_t)bcount << 1) | (in_use ? 1 : 0)); -} - - /* ----------------------------------------------------------- Thread safe allocation in an arena ----------------------------------------------------------- */ - -static void* mi_arena_allocx(mi_arena_t* arena, size_t start_idx, size_t end_idx, size_t needed_bcount, bool* is_zero, size_t* block_index) +static void* mi_arena_alloc(mi_arena_t* arena, size_t blocks, bool* is_zero, mi_bitmap_index_t* bitmap_idx) { - // Scan linearly through all block info's - // Skipping used ranges, coalescing free ranges on demand. - mi_assert_internal(needed_bcount > 0); - mi_assert_internal(start_idx <= arena->block_count); - mi_assert_internal(end_idx <= arena->block_count); - _Atomic(mi_block_info_t)* block = &arena->blocks[start_idx]; - _Atomic(mi_block_info_t)* end = &arena->blocks[end_idx]; - while (block < end) { - mi_block_info_t binfo = mi_atomic_read_relaxed(block); - size_t bcount = mi_block_count(binfo); - if (mi_block_is_in_use(binfo)) { - // in-use, skip ahead - mi_assert_internal(bcount > 0); - block += bcount; - } - else { - // free blocks - if (bcount==0) { - // optimization: - // use 0 initialized blocks at the end, to use single atomic operation - // initially to reduce contention (as we don't need to split) - if (block + needed_bcount > end) { - return NULL; // does not fit - } - else if (!mi_atomic_cas_weak(block, mi_block_info_create(needed_bcount, true), binfo)) { - // ouch, someone else was quicker. Try again.. 
- continue; - } - else { - // we got it: return a pointer to the claimed memory - ptrdiff_t idx = (block - arena->blocks); - *is_zero = arena->is_zero_init; - *block_index = idx; - return (arena->start + (idx*MI_ARENA_BLOCK_SIZE)); - } - } - - mi_assert_internal(bcount>0); - if (needed_bcount > bcount) { -#if 0 // MI_NO_ARENA_COALESCE - block += bcount; // too small, skip to the next range - continue; -#else - // too small, try to coalesce - _Atomic(mi_block_info_t)* block_next = block + bcount; - if (block_next >= end) { - return NULL; // does not fit - } - mi_block_info_t binfo_next = mi_atomic_read(block_next); - size_t bcount_next = mi_block_count(binfo_next); - if (mi_block_is_in_use(binfo_next)) { - // next block is in use, cannot coalesce - block += (bcount + bcount_next); // skip ahea over both blocks - } - else { - // next block is free, try to coalesce - // first set the next one to being used to prevent dangling ranges - if (!mi_atomic_cas_strong(block_next, mi_block_info_create(bcount_next, true), binfo_next)) { - // someone else got in before us.. try again - continue; - } - else { - if (!mi_atomic_cas_strong(block, mi_block_info_create(bcount + bcount_next, true), binfo)) { // use strong to increase success chance - // someone claimed/coalesced the block in the meantime - // first free the next block again.. - bool ok = mi_atomic_cas_strong(block_next, mi_block_info_create(bcount_next, false), binfo_next); // must be strong - mi_assert(ok); UNUSED(ok); - // and try again - continue; - } - else { - // coalesced! try again - // todo: we could optimize here to immediately claim the block if the - // coalesced size is a fit instead of retrying. Keep it simple for now. - continue; - } - } - } -#endif - } - else { // needed_bcount <= bcount - mi_assert_internal(needed_bcount <= bcount); - // it fits, claim the whole block - if (!mi_atomic_cas_weak(block, mi_block_info_create(bcount, true), binfo)) { - // ouch, someone else was quicker. Try again.. - continue; - } - else { - // got it, now split off the needed part - if (needed_bcount < bcount) { - mi_atomic_write(block + needed_bcount, mi_block_info_create(bcount - needed_bcount, false)); - mi_atomic_write(block, mi_block_info_create(needed_bcount, true)); - } - // return a pointer to the claimed memory - ptrdiff_t idx = (block - arena->blocks); - *is_zero = false; - *block_index = idx; - return (arena->start + (idx*MI_ARENA_BLOCK_SIZE)); - } - } + const size_t fcount = arena->field_count; + size_t idx = mi_atomic_read(&arena->search_idx); // start from last search + for (size_t visited = 0; visited < fcount; visited++, idx++) { + if (idx >= fcount) idx = 0; // wrap around + if (mi_bitmap_try_claim_field(arena->blocks_map, idx, blocks, bitmap_idx)) { + // claimed it! set the dirty bits + *is_zero = mi_bitmap_claim(arena->blocks_dirty, fcount, blocks, *bitmap_idx); + mi_atomic_write(&arena->search_idx, idx); // start search from here next time + return (arena->start + (*bitmap_idx)*MI_ARENA_BLOCK_SIZE); } } - // no success return NULL; } -// Try to reduce search time by starting from bottom and wrap around. 
-static void* mi_arena_alloc(mi_arena_t* arena, size_t needed_bcount, bool* is_zero, size_t* block_index) -{ - uintptr_t bottom = mi_atomic_read_relaxed(&arena->block_bottom); - void* p = mi_arena_allocx(arena, bottom, arena->block_count, needed_bcount, is_zero, block_index); - if (p == NULL && bottom > 0) { - // try again from the start - p = mi_arena_allocx(arena, 0, bottom, needed_bcount, is_zero, block_index); - } - if (p != NULL) { - mi_atomic_write(&arena->block_bottom, *block_index); - } - return p; -} /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool* commit, bool* large, bool* is_zero, - size_t* memid) + bool* commit, bool* large, bool* is_zero, size_t* memid) { - size_t block_index = SIZE_MAX; - void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &block_index); + mi_bitmap_index_t bitmap_index; + void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &bitmap_index); if (p != NULL) { - mi_assert_internal(block_index != SIZE_MAX); - #if MI_DEBUG>=1 - _Atomic(mi_block_info_t)* block = &arena->blocks[block_index]; - mi_block_info_t binfo = mi_atomic_read(block); - mi_assert_internal(mi_block_is_in_use(binfo)); - mi_assert_internal(mi_block_count(binfo) >= needed_bcount); - #endif - *memid = mi_memid_create(arena_index, block_index); + *memid = mi_memid_create(arena_index, bitmap_index); *commit = true; // TODO: support commit on demand? *large = arena->is_large; } @@ -261,15 +146,13 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, if (large==NULL) large = &default_large; // ensure `large != NULL` // try to allocate in an arena if the alignment is small enough - // and if there is not too much waste around the `MI_ARENA_BLOCK_SIZE`. - if (alignment <= MI_SEGMENT_ALIGN && - size >= 3*(MI_ARENA_BLOCK_SIZE/4) && // > 12MiB (not more than 25% waste) - !(size > MI_ARENA_BLOCK_SIZE && size < 3*(MI_ARENA_BLOCK_SIZE/2)) // ! <16MiB - 24MiB> - ) + // and the object is not too large or too small. 
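+ // (i.e. between MI_ARENA_MIN_OBJ_SIZE (16MiB) and MI_ARENA_MAX_OBJ_SIZE (2GiB))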
+ if (alignment <= MI_SEGMENT_ALIGN && + size <= MI_ARENA_MAX_OBJ_SIZE && + size >= MI_ARENA_MIN_OBJ_SIZE) { - size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); - size_t bcount = asize / MI_ARENA_BLOCK_SIZE; - int numa_node = _mi_os_numa_node(tld); // current numa node + const size_t bcount = mi_arena_block_count_of_size(size); + const int numa_node = _mi_os_numa_node(tld); // current numa node mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); // try numa affine allocation @@ -324,8 +207,8 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) { else { // allocated in an arena size_t arena_idx; - size_t block_idx; - mi_memid_indices(memid, &arena_idx, &block_idx); + size_t bitmap_idx; + mi_memid_indices(memid, &arena_idx, &bitmap_idx); mi_assert_internal(arena_idx < MI_MAX_ARENAS); mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[arena_idx])); mi_assert_internal(arena != NULL); @@ -333,27 +216,17 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) { _mi_fatal_error("trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } - mi_assert_internal(arena->block_count > block_idx); - if (arena->block_count <= block_idx) { - _mi_fatal_error("trying to free from non-existent block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); + if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { + _mi_fatal_error("trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } - _Atomic(mi_block_info_t)* block = &arena->blocks[block_idx]; - mi_block_info_t binfo = mi_atomic_read_relaxed(block); - mi_assert_internal(mi_block_is_in_use(binfo)); - mi_assert_internal(mi_block_count(binfo)*MI_ARENA_BLOCK_SIZE >= size); - if (!mi_block_is_in_use(binfo)) { + const size_t blocks = mi_arena_block_count_of_size(size); + bool ones = mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx); + if (!ones) { _mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size); return; }; - bool ok = mi_atomic_cas_strong(block, mi_block_info_create(mi_block_count(binfo), false), binfo); - mi_assert_internal(ok); - if (!ok) { - _mi_warning_message("unable to free arena block: %p, info 0x%zx", p, binfo); - } - if (block_idx < mi_atomic_read_relaxed(&arena->block_bottom)) { - mi_atomic_write(&arena->block_bottom, block_idx); - } } } @@ -365,8 +238,7 @@ static bool mi_arena_add(mi_arena_t* arena) { mi_assert_internal(arena != NULL); mi_assert_internal((uintptr_t)arena->start % MI_SEGMENT_ALIGN == 0); mi_assert_internal(arena->block_count > 0); - mi_assert_internal(mi_mem_is_zero(arena->blocks,arena->block_count*sizeof(mi_block_info_t))); - + uintptr_t i = mi_atomic_addu(&mi_arena_count,1); if (i >= MI_MAX_ARENAS) { mi_atomic_subu(&mi_arena_count, 1); @@ -383,40 +255,49 @@ static bool mi_arena_add(mi_arena_t* arena) { #include // ENOMEM // reserve at a specific numa node -int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept { +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { if (pages==0) return 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); size_t hsize = 0; size_t pages_reserved = 0; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, pages*500, &pages_reserved, &hsize); + void* p = 
_mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize); if (p==NULL || pages_reserved==0) { _mi_warning_message("failed to reserve %zu gb huge pages\n", pages); return ENOMEM; } _mi_verbose_message("reserved %zu gb huge pages\n", pages_reserved); - size_t bcount = hsize / MI_ARENA_BLOCK_SIZE; - size_t asize = sizeof(mi_arena_t) + (bcount*sizeof(mi_block_info_t)); // one too much + size_t bcount = mi_arena_block_count_of_size(hsize); + size_t fields = (bcount + MI_BITMAP_FIELD_BITS - 1) / MI_BITMAP_FIELD_BITS; + size_t asize = sizeof(mi_arena_t) + (2*fields*sizeof(mi_bitmap_field_t)); mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS? if (arena == NULL) { _mi_os_free_huge_pages(p, hsize, &_mi_stats_main); return ENOMEM; } arena->block_count = bcount; - arena->start = (uint8_t*)p; - arena->block_bottom = 0; + arena->field_count = fields; + arena->start = (uint8_t*)p; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = true; arena->is_zero_init = true; - memset(arena->blocks, 0, bcount * sizeof(mi_block_info_t)); + arena->search_idx = 0; + arena->blocks_dirty = &arena->blocks_map[bcount]; + size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; + if (post > 0) { + // don't use leftover bits at the end + mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); + mi_bitmap_claim(arena->blocks_map, fields, post, postidx); + } + mi_arena_add(arena); return 0; } // reserve huge pages evenly among all numa nodes. -int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept { +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t timeout_msecs) mi_attr_noexcept { if (pages == 0) return 0; // pages per numa node @@ -424,12 +305,13 @@ int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept { if (numa_count <= 0) numa_count = 1; const size_t pages_per = pages / numa_count; const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs / numa_count) + 50; // reserve evenly among numa nodes for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { size_t node_pages = pages_per; // can be 0 if ((size_t)numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, numa_node); + int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per); if (err) return err; if (pages < node_pages) { pages = 0; @@ -446,7 +328,7 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv UNUSED(max_secs); _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); if (pages_reserved != NULL) *pages_reserved = 0; - int err = mi_reserve_huge_os_pages_interleave(pages); + int err = mi_reserve_huge_os_pages_interleave(pages, (size_t)(max_secs * 1000.0)); if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; return err; } diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index 5bea4748..aeb185d1 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -135,13 +135,15 @@ static inline bool mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, } // Set `count` bits at `bitmap_idx` to 0 atomically -static inline void mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { +// Returns `true` if all `count` bits were 1 previously +static inline bool 
mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); mi_assert_internal((bitmap[idx] & mask) == mask); - mi_atomic_and(&bitmap[idx], ~mask); + uintptr_t prev = mi_atomic_and(&bitmap[idx], ~mask); + return ((prev & mask) == mask); } diff --git a/src/init.c b/src/init.c index ef848de4..f6d253f9 100644 --- a/src/init.c +++ b/src/init.c @@ -433,8 +433,8 @@ static void mi_process_load(void) { } if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { - size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); - mi_reserve_huge_os_pages_interleave(pages); + size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); + mi_reserve_huge_os_pages_interleave(pages, pages*500); } } diff --git a/src/os.c b/src/os.c index 254f85f1..027df6ab 100644 --- a/src/os.c +++ b/src/os.c @@ -940,16 +940,18 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout - mi_msecs_t elapsed = _mi_clock_end(start_t); - if (page >= 1) { - mi_msecs_t estimate = ((elapsed / (page+1)) * pages); - if (estimate > 2*max_msecs) { // seems like we are going to timeout, break - elapsed = max_msecs + 1; + if (max_msecs > 0) { + mi_msecs_t elapsed = _mi_clock_end(start_t); + if (page >= 1) { + mi_msecs_t estimate = ((elapsed / (page+1)) * pages); + if (estimate > 2*max_msecs) { // seems like we are going to timeout, break + elapsed = max_msecs + 1; + } + } + if (elapsed > max_msecs) { + _mi_warning_message("huge page allocation timed out\n"); + break; } - } - if (elapsed > max_msecs) { - _mi_warning_message("huge page allocation timed out\n"); - break; } } mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size); From 378716c46724d839411166a0bba68b0722cf9d8b Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Nov 2019 10:26:52 -0800 Subject: [PATCH 04/12] refactor and improve atomic bitmap usage --- CMakeLists.txt | 12 ++- ide/vs2019/mimalloc-override.vcxproj | 3 + ide/vs2019/mimalloc.vcxproj | 4 +- include/mimalloc-internal.h | 11 ++- include/mimalloc-types.h | 10 +-- src/arena.c | 62 +++++++-------- src/bitmap.inc.c | 110 ++++++++++++++++++--------- src/memory.c | 96 +++++++++++------------ src/page.c | 2 + test/test-stress.c | 4 +- 10 files changed, 183 insertions(+), 131 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12540f68..0726c601 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,7 @@ option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF) option(MI_SECURE "Use security mitigations (like guard pages and randomization)" OFF) +option(MI_SECURE_FULL "Use full security mitigations (like double free protection, more expensive)" OFF) option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) option(MI_BUILD_TESTS "Build test executables" ON) @@ -70,9 +71,14 @@ if(MI_OVERRIDE MATCHES "ON") endif() endif() -if(MI_SECURE MATCHES "ON") - message(STATUS "Set secure build (MI_SECURE=ON)") - list(APPEND mi_defines MI_SECURE=3) +if(MI_SECURE_FULL MATCHES "ON") + message(STATUS "Set full secure build (experimental) 
(MI_SECURE_FULL=ON)") + list(APPEND mi_defines MI_SECURE=4) +else() + if(MI_SECURE MATCHES "ON") + message(STATUS "Set secure build (MI_SECURE=ON)") + list(APPEND mi_defines MI_SECURE=3) + endif() endif() if(MI_SEE_ASM MATCHES "ON") diff --git a/ide/vs2019/mimalloc-override.vcxproj b/ide/vs2019/mimalloc-override.vcxproj index e1c7535c..49f3d213 100644 --- a/ide/vs2019/mimalloc-override.vcxproj +++ b/ide/vs2019/mimalloc-override.vcxproj @@ -232,6 +232,9 @@ + + true + diff --git a/ide/vs2019/mimalloc.vcxproj b/ide/vs2019/mimalloc.vcxproj index 19696c10..bae49bab 100644 --- a/ide/vs2019/mimalloc.vcxproj +++ b/ide/vs2019/mimalloc.vcxproj @@ -218,7 +218,9 @@ - + + true + diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 413f76e6..4d8b6a77 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -163,7 +163,6 @@ bool _mi_page_is_valid(mi_page_t* page); // Overflow detecting multiply -#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { #if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5 #include // UINT_MAX, ULONG_MAX @@ -175,6 +174,7 @@ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { return __builtin_umulll_overflow(count, size, total); #endif #else /* __builtin_umul_overflow is unavailable */ + #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) *total = count * size; return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count); @@ -188,6 +188,7 @@ static inline bool _mi_is_power_of_two(uintptr_t x) { // Align upwards static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { + mi_assert_internal(alignment != 0); uintptr_t mask = alignment - 1; if ((alignment & mask) == 0) { // power of two? return ((sz + mask) & ~mask); @@ -197,6 +198,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { } } +// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. +static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { + mi_assert_internal(divider != 0); + return (divider == 0 ? size : ((size + divider - 1) / divider)); +} + // Is memory zero initialized? static inline bool mi_mem_is_zero(void* p, size_t size) { for (size_t i = 0; i < size; i++) { @@ -283,7 +290,7 @@ static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment; - mi_assert_internal(diff >= 0 && diff < MI_SEGMENT_SIZE); + mi_assert_internal(diff >= 0 && (size_t)diff < MI_SEGMENT_SIZE); uintptr_t idx = (uintptr_t)diff >> segment->page_shift; mi_assert_internal(idx < segment->capacity); mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0); diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 99b6b22b..ced8e7a9 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -29,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file // #define MI_SECURE 4 // experimental, may be more expensive: checks for double free. #if !defined(MI_SECURE) -#define MI_SECURE 0 +#define MI_SECURE 4 #endif // Define MI_DEBUG for debug mode @@ -93,12 +93,12 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4mb // Derived constants -#define MI_SEGMENT_SIZE (1<= MI_ARENA_BLOCK_SIZE, 32MiB). Currently only used to -allocate in one arena consisting of huge OS pages -- otherwise it -delegates to direct allocation from the OS. +large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB). +In contrast to the rest of mimalloc, the arenas are shared between +threads and need to be accessed using atomic operations. -In the future, we can expose an API to manually add more arenas which -is sometimes needed for embedded devices or shared memory for example. +Currently arenas are only used to for huge OS page (1GiB) reservations, +otherwise it delegates to direct allocation from the OS. +In the future, we can expose an API to manually add more kinds of arenas +which is sometimes needed for embedded devices or shared memory for example. +(We can also employ this with WASI or `sbrk` systems to reserve large arenas + on demand and be able to reuse them efficiently). The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. The current implementation of the bitmap can @@ -48,10 +52,6 @@ int _mi_os_numa_node_count(void); #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 16MiB #define MI_MAX_ARENAS (64) // not more than 256 (since we use 8 bits in the memid) -// Block info: bit 0 contains the `in_use` bit, the upper bits the -// size in count of arena blocks. -typedef uintptr_t mi_block_info_t; - // A memory arena descriptor typedef struct mi_arena_s { uint8_t* start; // the start of the memory area @@ -61,8 +61,8 @@ typedef struct mi_arena_s { bool is_zero_init; // is the arena zero initialized? bool is_large; // large OS page allocated volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks } mi_arena_t; @@ -81,6 +81,7 @@ static _Atomic(uintptr_t) mi_arena_count; // = 0 static size_t mi_memid_create(size_t arena_index, mi_bitmap_index_t bitmap_index) { mi_assert_internal(arena_index < 0xFE); + mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow? 
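  // Layout sketch of the returned memid (follows directly from the expression below):
  // the low 8 bits hold `arena_index+1` (so the low byte of a valid arena id is never 0),
  // and the remaining high bits hold the bitmap index of the first claimed block.
  // The assert above checks that the bitmap index still fits once shifted left by 8.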
return ((bitmap_index << 8) | ((arena_index+1) & 0xFF)); } @@ -90,30 +91,25 @@ static void mi_memid_indices(size_t memid, size_t* arena_index, mi_bitmap_index_ *bitmap_index = (memid >> 8); } - -static size_t mi_arena_block_count_of_size(size_t size) { - const size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); - const size_t bcount = asize / MI_ARENA_BLOCK_SIZE; - return bcount; +static size_t mi_block_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); } /* ----------------------------------------------------------- Thread safe allocation in an arena ----------------------------------------------------------- */ -static void* mi_arena_alloc(mi_arena_t* arena, size_t blocks, bool* is_zero, mi_bitmap_index_t* bitmap_idx) +static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx) { const size_t fcount = arena->field_count; size_t idx = mi_atomic_read(&arena->search_idx); // start from last search for (size_t visited = 0; visited < fcount; visited++, idx++) { if (idx >= fcount) idx = 0; // wrap around if (mi_bitmap_try_claim_field(arena->blocks_map, idx, blocks, bitmap_idx)) { - // claimed it! set the dirty bits - *is_zero = mi_bitmap_claim(arena->blocks_dirty, fcount, blocks, *bitmap_idx); mi_atomic_write(&arena->search_idx, idx); // start search from here next time - return (arena->start + (*bitmap_idx)*MI_ARENA_BLOCK_SIZE); + return true; } } - return NULL; + return false; } @@ -125,13 +121,15 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n bool* commit, bool* large, bool* is_zero, size_t* memid) { mi_bitmap_index_t bitmap_index; - void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &bitmap_index); - if (p != NULL) { - *memid = mi_memid_create(arena_index, bitmap_index); - *commit = true; // TODO: support commit on demand? - *large = arena->is_large; + if (mi_arena_alloc(arena, needed_bcount, &bitmap_index)) { + // claimed it! set the dirty bits (todo: no need for an atomic op here?) + *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index); + *memid = mi_memid_create(arena_index, bitmap_index); + *commit = true; // TODO: support commit on demand? 
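  // note (a sketch of the reasoning): arena memory currently always comes from the
  // up-front huge OS page reservation, which is committed when reserved, so commit
  // can be reported unconditionally here (see the TODO above for commit on demand).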
+ *large = arena->is_large; + return (arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE)); } - return p; + return NULL; } void* _mi_arena_alloc_aligned(size_t size, size_t alignment, @@ -140,7 +138,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, { mi_assert_internal(memid != NULL && tld != NULL); mi_assert_internal(size > 0); - *memid = MI_MEMID_OS; + *memid = MI_MEMID_OS; *is_zero = false; bool default_large = false; if (large==NULL) large = &default_large; // ensure `large != NULL` @@ -151,7 +149,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size <= MI_ARENA_MAX_OBJ_SIZE && size >= MI_ARENA_MIN_OBJ_SIZE) { - const size_t bcount = mi_arena_block_count_of_size(size); + const size_t bcount = mi_block_count_of_size(size); const int numa_node = _mi_os_numa_node(tld); // current numa node mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); @@ -221,7 +219,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) { _mi_fatal_error("trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } - const size_t blocks = mi_arena_block_count_of_size(size); + const size_t blocks = mi_block_count_of_size(size); bool ones = mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx); if (!ones) { _mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size); @@ -268,7 +266,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec } _mi_verbose_message("reserved %zu gb huge pages\n", pages_reserved); - size_t bcount = mi_arena_block_count_of_size(hsize); + size_t bcount = mi_block_count_of_size(hsize); size_t fields = (bcount + MI_BITMAP_FIELD_BITS - 1) / MI_BITMAP_FIELD_BITS; size_t asize = sizeof(mi_arena_t) + (2*fields*sizeof(mi_bitmap_field_t)); mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS? @@ -284,6 +282,8 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec arena->is_zero_init = true; arena->search_idx = 0; arena->blocks_dirty = &arena->blocks_map[bcount]; + // the bitmaps are already zero initialized due to os_alloc + // just claim leftover blocks if needed size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; if (post > 0) { // don't use leftover bits at the end diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index aeb185d1..19e6bbb8 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -1,41 +1,30 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +This file is meant to be included in other files for efficiency. +It implements a bitmap that can set/reset sequences of bits atomically +and is used to concurrently claim memory ranges. + +A bitmap is an array of fields where each field is a machine word (`uintptr_t`) + +A current limitation is that the bit sequences cannot cross fields +and that the sequence must be smaller or equal to the bits in a field. 
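For example, with 64-bit words (MI_BITMAP_FIELD_BITS == 64) a single claim can
cover at most 64 consecutive blocks.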
+---------------------------------------------------------------------------- */ #pragma once -#ifndef MI_BITMAP_H -#define MI_BITMAP_H +#ifndef MI_BITMAP_C +#define MI_BITMAP_C #include "mimalloc.h" #include "mimalloc-internal.h" -// Use bit scan forward to quickly find the first zero bit if it is available -#if defined(_MSC_VER) -#define MI_HAVE_BITSCAN -#include -static inline size_t mi_bsf(uintptr_t x) { - if (x==0) return 8*MI_INTPTR_SIZE; - DWORD idx; - MI_64(_BitScanForward)(&idx, x); - return idx; -} -static inline size_t mi_bsr(uintptr_t x) { - if (x==0) return 8*MI_INTPTR_SIZE; - DWORD idx; - MI_64(_BitScanReverse)(&idx, x); - return idx; -} -#elif defined(__GNUC__) || defined(__clang__) -#define MI_HAVE_BITSCAN -#if (INTPTR_MAX == LONG_MAX) -# define MI_L(x) x##l -#else -# define MI_L(x) x##ll -#endif -static inline size_t mi_bsf(uintptr_t x) { - return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x)); -} -static inline size_t mi_bsr(uintptr_t x) { - return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x)); -} -#endif - +/* ----------------------------------------------------------- + Bitmap definition +----------------------------------------------------------- */ #define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE) #define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set @@ -63,14 +52,59 @@ static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) return (bitmap_idx % MI_BITMAP_FIELD_BITS); } +// Get the full bit index +static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { + return bitmap_idx; +} + + // The bit mask for a given number of blocks at a specified bit index. static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); return ((((uintptr_t)1 << count) - 1) << bitidx); } -// Try to atomically claim a sequence of `count` bits in a single field at `idx` in `bitmap`. -// Returns `true` on success. + +/* ----------------------------------------------------------- + Use bit scan forward/reverse to quickly find the first zero bit if it is available +----------------------------------------------------------- */ +#if defined(_MSC_VER) +#define MI_HAVE_BITSCAN +#include +static inline size_t mi_bsf(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + MI_64(_BitScanForward)(&idx, x); + return idx; +} +static inline size_t mi_bsr(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + MI_64(_BitScanReverse)(&idx, x); + return idx; +} +#elif defined(__GNUC__) || defined(__clang__) +#include // LONG_MAX +#define MI_HAVE_BITSCAN +#if (INTPTR_MAX == LONG_MAX) +# define MI_L(x) x##l +#else +# define MI_L(x) x##ll +#endif +static inline size_t mi_bsf(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x)); +} +static inline size_t mi_bsr(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x)); +} +#endif + +/* ----------------------------------------------------------- + Claim a bit sequence atomically +----------------------------------------------------------- */ + +// Try to atomically claim a sequence of `count` bits in a single +// field at `idx` in `bitmap`. Returns `true` on success. 
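// A minimal usage sketch (mirroring how arena.c in this patch drives the bitmap;
// the `arena` names are illustrative): claim a range in one field, mark it dirty,
// and release it again later.
//
//   mi_bitmap_index_t bitmap_idx;
//   if (mi_bitmap_try_claim_field(arena->blocks_map, field_idx, blocks, &bitmap_idx)) {
//     // `was_zero` is true when none of the blocks were dirty yet (still zero initialized)
//     bool was_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, blocks, bitmap_idx);
//     // ... use the blocks ...
//     mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx);
//   }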
static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { mi_assert_internal(bitmap_idx != NULL); @@ -93,7 +127,7 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con while (bitidx <= bitidx_max) { if ((map & m) == 0) { // are the mask bits free at bitidx? mi_assert_internal((m >> bitidx) == mask); // no overflow? - uintptr_t newmap = map | m; + const uintptr_t newmap = map | m; mi_assert_internal((newmap^map) >> bitidx == mask); if (!mi_atomic_cas_weak(field, newmap, map)) { // TODO: use strong cas here? // no success, another thread claimed concurrently.. keep going @@ -109,10 +143,10 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, con else { // on to the next bit range #ifdef MI_HAVE_BITSCAN - size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); + const size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); mi_assert_internal(shift > 0 && shift <= count); #else - size_t shift = 1; + const size_t shift = 1; #endif bitidx += shift; m <<= shift; diff --git a/src/memory.c b/src/memory.c index 29e0e412..bdbf1e48 100644 --- a/src/memory.c +++ b/src/memory.c @@ -16,10 +16,10 @@ We need this memory layer between the raw OS calls because of: 1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order to reuse memory effectively. 2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of - an OS allocation/free is still (much) too expensive relative to the accesses in that - object :-( (`malloc-large` tests this). This means we need a cheaper way to - reuse memory. -3. This layer can help with a NUMA aware allocation in the future. + an OS allocation/free is still (much) too expensive relative to the accesses + in that object :-( (`malloc-large` tests this). This means we need a cheaper + way to reuse memory. +3. This layer allows for NUMA aware allocation. 
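Concretely: the segment allocator requests MI_SEGMENT_SIZE (4MiB) aligned blocks from
this layer, which claims them as bits in a per-region bitmap (one region spans 256MiB)
and only allocates a fresh region from an arena or the OS when no existing region has
free blocks.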
Possible issues: - (2) can potentially be addressed too with a small cache per thread which is much @@ -47,8 +47,6 @@ bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats); bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats); bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -//void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld); -//void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats); // arena.c void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats); @@ -58,18 +56,18 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo // Constants #if (MI_INTPTR_SIZE==8) -#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 16KiB for the region map +#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 40KiB for the region map #elif (MI_INTPTR_SIZE==4) -#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // 196 bytes for the region map +#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // ~ KiB for the region map #else #error "define the maximum heap space allowed for regions on this platform" #endif #define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE -#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB +#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB (64MiB on 32 bits) #define MI_REGION_MAX_ALLOC_SIZE (MI_REGION_SIZE/4) // 64MiB -#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) +#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) // 1024 (48 on 32 bits) // Region info is a pointer to the memory region and two bits for @@ -95,7 +93,7 @@ typedef struct mem_region_s { size_t arena_memid; // if allocated from a (huge page) arena } mem_region_t; -// The region map; 16KiB for a 256GiB HEAP_REGION_MAX +// The region map static mem_region_t regions[MI_REGION_MAX]; // A bit mask per region for its claimed MI_SEGMENT_SIZE blocks. @@ -173,7 +171,7 @@ static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_i bool region_large = allow_large; bool is_zero = false; size_t arena_memid = 0; - void* start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); + void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); mi_assert_internal(!(region_large && !allow_large)); if (start == NULL) { @@ -183,35 +181,31 @@ static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_i } // set the newly allocated region + // try to initialize any region up to 4 beyond the current one in + // care multiple threads are doing this concurrently (common at startup) info = mi_region_info_create(start, region_large, region_commit); - if (mi_atomic_cas_strong(®ions[idx].info, info, 0)) { - // update the region count - regions[idx].arena_memid = arena_memid; - mi_atomic_write(®ions[idx].numa_node, _mi_os_numa_node(tld) + 1); - mi_atomic_write(®ions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0)); - mi_atomic_increment(®ions_count); - } - else { - // failed, another thread allocated just before us! - // we assign it to a later slot instead (up to 4 tries). 
- for (size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { - if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { - regions[idx+i].arena_memid = arena_memid; - mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node(tld) + 1); - mi_atomic_write(®ions_dirty[idx], is_zero ? 0 : ~((uintptr_t)0)); - mi_atomic_increment(®ions_count); - start = NULL; - break; - } + bool claimed = false; + for (size_t i = 0; i <= 4 && idx + i < MI_REGION_MAX && !claimed; i++) { + if (!is_zero) { + // set dirty bits before CAS; this might race with a zero block but that is ok. + // (but writing before cas prevents a concurrent allocation to assume it is not dirty) + mi_atomic_write(®ions_dirty[idx+i], MI_BITMAP_FIELD_FULL); } - if (start != NULL) { - // free it if we didn't succeed to save it to some other region - _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); - // _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats); + if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { + // claimed! + regions[idx+i].arena_memid = arena_memid; + mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node(tld) + 1); + mi_atomic_increment(®ions_count); + claimed = true; } - // and continue with the memory at our index - info = mi_atomic_read(®ions[idx].info); } + if (!claimed) { + // free our OS allocation if we didn't succeed to store it in some region + _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); + } + // continue with the actual info at our index in case another thread was quicker with the allocation + info = mi_atomic_read(®ions[idx].info); + mi_assert_internal(info != 0); } mi_assert_internal(info == mi_atomic_read(®ions[idx].info)); mi_assert_internal(info != 0); @@ -290,19 +284,21 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a int rnode = ((int)mi_atomic_read_relaxed(®ions->numa_node)) - 1; if (rnode != numa_node) return false; } - if (mi_unlikely(!(commit || allow_large))) { - // otherwise skip incompatible regions if possible. - // this is not guaranteed due to multiple threads allocating at the same time but - // that's ok. In secure mode, large is never allowed for any thread, so that works out; - // otherwise we might just not be able to reset/decommit individual pages sometimes. - mi_region_info_t info = mi_atomic_read_relaxed(®ions->info); - bool is_large; - bool is_committed; - void* start = mi_region_info_read(info, &is_large, &is_committed); - bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation? - if (!ok) return false; - } - return true; + if (commit && allow_large) return true; // always ok + + // otherwise skip incompatible regions if possible. + // this is not guaranteed due to multiple threads allocating at the same time but + // that's ok. In secure mode, large is never allowed for any thread, so that works out; + // otherwise we might just not be able to reset/decommit individual pages sometimes. + mi_region_info_t info = mi_atomic_read_relaxed(®ions->info); + bool is_large; + bool is_committed; + void* start = mi_region_info_read(info, &is_large, &is_committed); + // note: we also skip if commit is false and the region is committed, + // that is a bit strong but prevents allocation of eager delayed segments in + // committed memory + bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation? 
+ return ok; } // Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim. diff --git a/src/page.c b/src/page.c index 32b68edb..c5b6e370 100644 --- a/src/page.c +++ b/src/page.c @@ -497,8 +497,10 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats) { UNUSED(stats); + #if (MI_SECURE <= 2) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); + #endif mi_assert_internal(page->capacity + extend <= page->reserved); void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); size_t bsize = page->block_size; diff --git a/test/test-stress.c b/test/test-stress.c index bb428072..d80cb1a4 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -66,7 +66,9 @@ static void* alloc_items(size_t items, random_t r) { if (chance(1, r)) items *= 100; // 1% huge objects; if (items==40) items++; // pthreads uses that size for stack increases uintptr_t* p = (uintptr_t*)mi_malloc(items*sizeof(uintptr_t)); - for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie; + if (p != NULL) { + for (uintptr_t i = 0; i < items; i++) p[i] = (items - i) ^ cookie; + } return p; } From 31d11f64d581abfd28818be65f3780506977d889 Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Nov 2019 10:33:45 -0800 Subject: [PATCH 05/12] fix secure free list extension where a non-empty initial free list was discarded --- include/mimalloc-types.h | 4 ++-- src/page.c | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 99b6b22b..893dcd67 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -26,7 +26,7 @@ terms of the MIT license. A copy of the license can be found in the file // #define MI_SECURE 1 // guard page around metadata // #define MI_SECURE 2 // guard page around each mimalloc page // #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) -// #define MI_SECURE 4 // experimental, may be more expensive: checks for double free. +// #define MI_SECURE 4 // experimental, may be more expensive: checks for double free. (cmake -DMI_SECURE_FULL=ON) #if !defined(MI_SECURE) #define MI_SECURE 0 @@ -35,7 +35,7 @@ terms of the MIT license. A copy of the license can be found in the file // Define MI_DEBUG for debug mode // #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free. 
// #define MI_DEBUG 2 // + internal assertion checks -// #define MI_DEBUG 3 // + extensive internal invariant checking +// #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_CHECK_FULL=ON) #if !defined(MI_DEBUG) #if !defined(NDEBUG) || defined(_DEBUG) #define MI_DEBUG 2 diff --git a/src/page.c b/src/page.c index f7fad764..cb3a4bf8 100644 --- a/src/page.c +++ b/src/page.c @@ -455,8 +455,8 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si while ((extend >> shift) == 0) { shift--; } - size_t slice_count = (size_t)1U << shift; - size_t slice_extend = extend / slice_count; + const size_t slice_count = (size_t)1U << shift; + const size_t slice_extend = extend / slice_count; mi_assert_internal(slice_extend >= 1); mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice size_t counts[MI_MAX_SLICES]; // available objects in the slice @@ -470,7 +470,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si // set up first element size_t current = _mi_heap_random(heap) % slice_count; counts[current]--; - page->free = blocks[current]; + mi_block_t* const free_start = blocks[current]; // and iterate through the rest uintptr_t rnd = heap->random; for (size_t i = 1; i < extend; i++) { @@ -490,7 +490,9 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next` current = next; } - mi_block_set_next(page, blocks[current], NULL); // end of the list + // prepend to the free list (usually NULL) + mi_block_set_next(page, blocks[current], page->free); // end of the list + page->free = free_start; heap->random = _mi_random_shuffle(rnd); } From 27f1a8b3d24acf0ff0bcbdacfbecd21437fb450e Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Nov 2019 10:35:30 -0800 Subject: [PATCH 06/12] fix avg display; set secure default to 0` --- include/mimalloc-types.h | 2 +- src/stats.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index ddbe72f3..3f5e4e27 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -29,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file // #define MI_SECURE 4 // experimental, may be more expensive: checks for double free. (cmake -DMI_SECURE_FULL=ON) #if !defined(MI_SECURE) -#define MI_SECURE 4 +#define MI_SECURE 0 #endif // Define MI_DEBUG for debug mode diff --git a/src/stats.c b/src/stats.c index 011fab64..cb6d8866 100644 --- a/src/stats.c +++ b/src/stats.c @@ -206,7 +206,7 @@ static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* const int64_t avg_tens = (stat->count == 0 ? 
0 : (stat->total*10 / stat->count)); const long avg_whole = (long)(avg_tens/10); const long avg_frac1 = (long)(avg_tens%10); - _mi_fprintf(out, "%10s: %5ld.%ld avg %ld %ld\n", msg, avg_whole, avg_frac1); + _mi_fprintf(out, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1); } From 9b6538880768ccbe0dde86cfc0018a7b035e7911 Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Nov 2019 10:59:19 -0800 Subject: [PATCH 07/12] fix space leak in secure mode where a non-null free list would be discarded --- src/page.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/page.c b/src/page.c index cb3a4bf8..aaf1cb91 100644 --- a/src/page.c +++ b/src/page.c @@ -439,15 +439,15 @@ void _mi_page_retire(mi_page_t* page) { #define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT) #define MI_MIN_SLICES (2) -static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, size_t extend, mi_stats_t* stats) { +static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t extend, mi_stats_t* const stats) { UNUSED(stats); #if (MI_SECURE<=2) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); #endif mi_assert_internal(page->capacity + extend <= page->reserved); - void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL); - size_t bsize = page->block_size; + void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL); + const size_t bsize = page->block_size; // initialize a randomized free list // set up `slice_count` slices to alternate between @@ -475,7 +475,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si uintptr_t rnd = heap->random; for (size_t i = 1; i < extend; i++) { // call random_shuffle only every INTPTR_SIZE rounds - size_t round = i%MI_INTPTR_SIZE; + const size_t round = i%MI_INTPTR_SIZE; if (round == 0) rnd = _mi_random_shuffle(rnd); // select a random next slice index size_t next = ((rnd >> 8*round) & (slice_count-1)); @@ -485,7 +485,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si } // and link the current block to it counts[next]--; - mi_block_t* block = blocks[current]; + mi_block_t* const block = blocks[current]; blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next` current = next; @@ -496,25 +496,28 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si heap->random = _mi_random_shuffle(rnd); } -static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats) +static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t extend, mi_stats_t* const stats) { UNUSED(stats); + #if (MI_SECURE <= 2) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); + #endif mi_assert_internal(page->capacity + extend <= page->reserved); - void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); - size_t bsize = page->block_size; - mi_block_t* start = mi_page_block_at(page, page_area, page->capacity); + void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); + const size_t bsize = page->block_size; + mi_block_t* const start = mi_page_block_at(page, page_area, page->capacity); // initialize a sequential free list - mi_block_t* last = mi_page_block_at(page, page_area, page->capacity + extend - 1); + 
mi_block_t* const last = mi_page_block_at(page, page_area, page->capacity + extend - 1); mi_block_t* block = start; while(block <= last) { mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize); mi_block_set_next(page,block,next); block = next; } - mi_block_set_next(page, last, NULL); + // prepend to free list (usually `NULL`) + mi_block_set_next(page, last, page->free); page->free = start; } From 56887aeb2f75d0ade86120e448e66a2684c920ff Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Nov 2019 10:59:45 -0800 Subject: [PATCH 08/12] add MI_SECURE_FULL=ON as a cmake option to include double free mitigation --- CMakeLists.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 81cc339a..59d889b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,7 @@ option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library" OFF) option(MI_SECURE "Use security mitigations (like guard pages and randomization)" OFF) +option(MI_SECURE_FULL "Use full security mitigations, may be more expensive (includes double-free mitigation)" OFF) option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) option(MI_BUILD_TESTS "Build test executables" ON) @@ -66,9 +67,15 @@ if(MI_OVERRIDE MATCHES "ON") endif() endif() -if(MI_SECURE MATCHES "ON") - message(STATUS "Set secure build (MI_SECURE=ON)") - list(APPEND mi_defines MI_SECURE=3) +if(MI_SECURE_FULL MATCHES "ON") + message(STATUS "Set full secure build (may be more expensive) (MI_SECURE_FULL=ON)") + list(APPEND mi_defines MI_SECURE=4) + set(MI_SECURE "ON") +else() + if(MI_SECURE MATCHES "ON") + message(STATUS "Set secure build (MI_SECURE=ON)") + list(APPEND mi_defines MI_SECURE=3) + endif() endif() if(MI_SEE_ASM MATCHES "ON") From 13f5e6e43e9aae4043d9acc94fac67746fcd9bb4 Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 7 Nov 2019 18:09:30 -0800 Subject: [PATCH 09/12] fix numa node check in regions --- src/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/memory.c b/src/memory.c index bdbf1e48..fb3f5093 100644 --- a/src/memory.c +++ b/src/memory.c @@ -282,7 +282,7 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a if (m == MI_BITMAP_FIELD_FULL) return false; if (numa_node >= 0) { // use negative numa node to always succeed int rnode = ((int)mi_atomic_read_relaxed(®ions->numa_node)) - 1; - if (rnode != numa_node) return false; + if (rnode >= 0 && rnode != numa_node) return false; } if (commit && allow_large) return true; // always ok From 7b72a4cd50782563104e28becb7e181e8978449f Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 8 Nov 2019 11:55:43 -0800 Subject: [PATCH 10/12] fix region suitable bug --- src/memory.c | 6 +++--- test/test-stress.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/memory.c b/src/memory.c index bdbf1e48..f8798d99 100644 --- a/src/memory.c +++ b/src/memory.c @@ -281,8 +281,8 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a uintptr_t m = mi_atomic_read_relaxed(®ions_map[idx]); if (m == MI_BITMAP_FIELD_FULL) return false; if (numa_node >= 0) { // use negative numa node to always succeed - int rnode = ((int)mi_atomic_read_relaxed(®ions->numa_node)) - 1; - if (rnode != numa_node) return false; + int rnode = ((int)mi_atomic_read_relaxed(®ions[idx].numa_node)) - 1; + if (rnode >= 0 && 
rnode != numa_node) return false; } if (commit && allow_large) return true; // always ok @@ -290,7 +290,7 @@ static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool a // this is not guaranteed due to multiple threads allocating at the same time but // that's ok. In secure mode, large is never allowed for any thread, so that works out; // otherwise we might just not be able to reset/decommit individual pages sometimes. - mi_region_info_t info = mi_atomic_read_relaxed(®ions->info); + mi_region_info_t info = mi_atomic_read_relaxed(®ions[idx].info); bool is_large; bool is_committed; void* start = mi_region_info_read(info, &is_large, &is_committed); diff --git a/test/test-stress.c b/test/test-stress.c index d80cb1a4..be2a9c67 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -18,7 +18,7 @@ terms of the MIT license. // argument defaults static int THREADS = 32; // more repeatable if THREADS <= #processors -static int N = 20; // scaling factor +static int N = 40; // scaling factor // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int N = 100; // scaling factor From 9f08ddd0d0d2909998d71bf6da9bce2b048d851e Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 9 Nov 2019 19:30:53 -0800 Subject: [PATCH 11/12] refactor regions; add commit tracking on a segment basis --- src/arena.c | 9 +- src/bitmap.inc.c | 14 +- src/memory.c | 382 ++++++++++++++++++++--------------------------- src/segment.c | 2 +- 4 files changed, 181 insertions(+), 226 deletions(-) diff --git a/src/arena.c b/src/arena.c index 8feec89f..1b6cf4a4 100644 --- a/src/arena.c +++ b/src/arena.c @@ -123,7 +123,7 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n mi_bitmap_index_t bitmap_index; if (mi_arena_alloc(arena, needed_bcount, &bitmap_index)) { // claimed it! set the dirty bits (todo: no need for an atomic op here?) - *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index); + *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); *memid = mi_memid_create(arena_index, bitmap_index); *commit = true; // TODO: support commit on demand? *large = arena->is_large; @@ -181,7 +181,10 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, // finally, fall back to the OS *is_zero = true; - *memid = MI_MEMID_OS; + *memid = MI_MEMID_OS; + if (*large) { + *large = mi_option_is_enabled(mi_option_large_os_pages); // try large OS pages only if enabled and allowed + } return _mi_os_alloc_aligned(size, alignment, *commit, large, tld); } @@ -288,7 +291,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec if (post > 0) { // don't use leftover bits at the end mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - mi_bitmap_claim(arena->blocks_map, fields, post, postidx); + mi_bitmap_claim(arena->blocks_map, fields, post, postidx, NULL); } mi_arena_add(arena); diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index 19e6bbb8..3847e712 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -61,6 +61,7 @@ static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { // The bit mask for a given number of blocks at a specified bit index. 
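// For example, mi_bitmap_mask_(3,2) == 0b11100. The full-field special case added
// below is needed because shifting a uintptr_t left by MI_BITMAP_FIELD_BITS is
// undefined behavior in C and would not reliably yield the all-ones mask.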
static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); + if (count == MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; return ((((uintptr_t)1 << count) - 1) << bitidx); } @@ -183,14 +184,25 @@ static inline bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, s // Set `count` bits at `bitmap_idx` to 1 atomically // Returns `true` if all `count` bits were 0 previously -static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { +static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { const size_t idx = mi_bitmap_index_field(bitmap_idx); const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); // mi_assert_internal((bitmap[idx] & mask) == 0); uintptr_t prev = mi_atomic_or(&bitmap[idx], mask); + if (any_zero != NULL) *any_zero = ((prev & mask) != mask); return ((prev & mask) == 0); } +// Returns `true` if all `count` bits were 1 +static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const uintptr_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); + // mi_assert_internal((bitmap[idx] & mask) == 0); + return ((mi_atomic_read(&bitmap[idx]) & mask) == mask); +} + #endif \ No newline at end of file diff --git a/src/memory.c b/src/memory.c index f8798d99..a1f94e18 100644 --- a/src/memory.c +++ b/src/memory.c @@ -65,10 +65,11 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo #define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE +#define MI_REGION_MAX_BLOCKS MI_BITMAP_FIELD_BITS #define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB (64MiB on 32 bits) -#define MI_REGION_MAX_ALLOC_SIZE (MI_REGION_SIZE/4) // 64MiB #define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) // 1024 (48 on 32 bits) - +#define MI_REGION_MAX_OBJ_BLOCKS (MI_REGION_MAX_BLOCKS/4) // 64MiB +#define MI_REGION_MAX_OBJ_SIZE (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE) // Region info is a pointer to the memory region and two bits for // its flags: is_large, and is_committed. @@ -88,20 +89,16 @@ static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, b // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. typedef struct mem_region_s { - volatile _Atomic(mi_region_info_t) info; // start of the memory area (and flags) - volatile _Atomic(uintptr_t) numa_node; // associated numa node + 1 (so 0 is no association) - size_t arena_memid; // if allocated from a (huge page) arena + volatile _Atomic(mi_region_info_t) info; // start of the memory area (and flags) + volatile _Atomic(uintptr_t) numa_node; // associated numa node + 1 (so 0 is no association) + mi_bitmap_field_t in_use; + mi_bitmap_field_t dirty; + size_t arena_memid; // if allocated from a (huge page) arena } mem_region_t; // The region map static mem_region_t regions[MI_REGION_MAX]; -// A bit mask per region for its claimed MI_SEGMENT_SIZE blocks. 
-static mi_bitmap_field_t regions_map[MI_REGION_MAX]; - -// A bit mask per region to track which blocks are dirty (= potentially written to) -static mi_bitmap_field_t regions_dirty[MI_REGION_MAX]; - // Allocated regions static volatile _Atomic(uintptr_t) regions_count; // = 0; @@ -112,8 +109,7 @@ Utility functions // Blocks (of 4MiB) needed for the given size. static size_t mi_region_block_count(size_t size) { - mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); - return (size + MI_SEGMENT_SIZE - 1) / MI_SEGMENT_SIZE; + return _mi_divide_up(size, MI_SEGMENT_SIZE); } // Return a rounded commit/reset size such that we don't fragment large OS pages into small ones. @@ -134,8 +130,11 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { } -static size_t mi_memid_create(mi_bitmap_index_t bitmap_idx) { - return bitmap_idx<<1; +static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) { + mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS); + size_t idx = region - regions; + mi_assert_internal(®ions[idx] == region); + return (idx*MI_BITMAP_FIELD_BITS + bit_idx)<<1; } static size_t mi_memid_create_from_arena(size_t arena_memid) { @@ -146,177 +145,149 @@ static bool mi_memid_is_arena(size_t id) { return ((id&1)==1); } -static bool mi_memid_indices(size_t id, mi_bitmap_index_t* bitmap_idx, size_t* arena_memid) { +static bool mi_memid_indices(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) { if (mi_memid_is_arena(id)) { *arena_memid = (id>>1); return true; } else { - *bitmap_idx = (mi_bitmap_index_t)(id>>1); + size_t idx = (id >> 1) / MI_BITMAP_FIELD_BITS; + *bit_idx = (mi_bitmap_index_t)(id>>1) % MI_BITMAP_FIELD_BITS; + *region = ®ions[idx]; return false; } } /* ---------------------------------------------------------------------------- - Ensure a region is allocated from the OS (or an arena) + Allocate a region is allocated from the OS (or an arena) -----------------------------------------------------------------------------*/ -static bool mi_region_ensure_allocated(size_t idx, bool allow_large, mi_region_info_t* pinfo, mi_os_tld_t* tld) +static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { - // ensure the region is reserved - mi_region_info_t info = mi_atomic_read(®ions[idx].info); - if (mi_unlikely(info == 0)) - { - bool region_commit = mi_option_is_enabled(mi_option_eager_region_commit); - bool region_large = allow_large; - bool is_zero = false; - size_t arena_memid = 0; - void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); - mi_assert_internal(!(region_large && !allow_large)); + // not out of regions yet? + if (mi_atomic_read_relaxed(®ions_count) >= MI_REGION_MAX - 1) return false; - if (start == NULL) { - // failure to allocate from the OS! fail - *pinfo = 0; - return false; - } - - // set the newly allocated region - // try to initialize any region up to 4 beyond the current one in - // care multiple threads are doing this concurrently (common at startup) - info = mi_region_info_create(start, region_large, region_commit); - bool claimed = false; - for (size_t i = 0; i <= 4 && idx + i < MI_REGION_MAX && !claimed; i++) { - if (!is_zero) { - // set dirty bits before CAS; this might race with a zero block but that is ok. 
- // (but writing before cas prevents a concurrent allocation to assume it is not dirty) - mi_atomic_write(®ions_dirty[idx+i], MI_BITMAP_FIELD_FULL); - } - if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { - // claimed! - regions[idx+i].arena_memid = arena_memid; - mi_atomic_write(®ions[idx+i].numa_node, _mi_os_numa_node(tld) + 1); - mi_atomic_increment(®ions_count); - claimed = true; - } - } - if (!claimed) { - // free our OS allocation if we didn't succeed to store it in some region - _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); - } - // continue with the actual info at our index in case another thread was quicker with the allocation - info = mi_atomic_read(®ions[idx].info); - mi_assert_internal(info != 0); + // try to allocate a fresh region from the OS + bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit)); + bool region_large = (commit && allow_large); + bool is_zero = false; + size_t arena_memid = 0; + void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_zero, &arena_memid, tld); + if (start == NULL) return false; + mi_assert_internal(!(region_large && !allow_large)); + + // claim a fresh slot + const uintptr_t idx = mi_atomic_increment(®ions_count); + if (idx >= MI_REGION_MAX) { + mi_atomic_decrement(®ions_count); + _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats); + return false; } - mi_assert_internal(info == mi_atomic_read(®ions[idx].info)); - mi_assert_internal(info != 0); - *pinfo = info; + + // allocated, initialize and claim the initial blocks + mem_region_t* r = ®ions[idx]; + r->numa_node = _mi_os_numa_node(tld) + 1; + r->arena_memid = arena_memid; + *bit_idx = 0; + mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL); + mi_atomic_write(&r->info, mi_region_info_create(start, region_large, region_commit)); // now make it available to others + *region = r; + return true; +} + +/* ---------------------------------------------------------------------------- + Try to claim blocks in suitable regions +-----------------------------------------------------------------------------*/ + +static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool commit, bool allow_large ) { + // initialized at all? 
+ mi_region_info_t info = mi_atomic_read_relaxed(®ion->info); + if (info==0) return false; + + // numa correct + if (numa_node >= 0) { // use negative numa node to always succeed + int rnode = ((int)mi_atomic_read_relaxed(®ion->numa_node)) - 1; + if (rnode >= 0 && rnode != numa_node) return false; + } + + // note: we also skip if commit is false and the region is committed, + // that is a bit strong but prevents allocation of eager-delayed segments in an eagerly committed region + bool is_large; + bool is_committed; + mi_region_info_read(info, &is_large, &is_committed); + + if (!commit && is_committed) return false; + if (!allow_large && is_large) return false; return true; } -/* ---------------------------------------------------------------------------- - Commit blocks ------------------------------------------------------------------------------*/ - -static void* mi_region_commit_blocks(mi_bitmap_index_t bitmap_idx, mi_region_info_t info, size_t blocks, size_t size, bool* commit, bool* is_large, bool* is_zero, mi_os_tld_t* tld) +static bool mi_region_try_claim(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { - // set dirty bits - *is_zero = mi_bitmap_claim(regions_dirty, MI_REGION_MAX, blocks, bitmap_idx); + // try all regions for a free slot + const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld)); + const size_t count = mi_atomic_read(®ions_count); + size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? + for (size_t visited = 0; visited < count; visited++, idx++) { + if (idx >= count) idx = 0; // wrap around + mem_region_t* r = ®ions[idx]; + if (mi_region_is_suitable(r, numa_node, commit, allow_large)) { + if (mi_bitmap_try_claim_field(&r->in_use, 0, blocks, bit_idx)) { + tld->region_idx = idx; // remember the last found position + *region = r; + return true; + } + } + } + return false; +} - // Commit the blocks to memory + +static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) +{ + mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS); + mem_region_t* region; + mi_bitmap_index_t bit_idx; + // first try to claim in existing regions + if (!mi_region_try_claim(blocks, *commit, *is_large, ®ion, &bit_idx, tld)) { + // otherwise try to allocate a fresh region + if (!mi_region_try_alloc_os(blocks, *commit, *is_large, ®ion, &bit_idx, tld)) { + // out of regions or memory + return NULL; + } + } + + // found a region and claimed `blocks` at `bit_idx` + mi_assert_internal(region != NULL); + mi_assert_internal(mi_bitmap_is_claimed(®ion->in_use, 1, blocks, bit_idx)); + + mi_region_info_t info = mi_atomic_read(®ion->info); bool region_is_committed = false; bool region_is_large = false; void* start = mi_region_info_read(info, ®ion_is_large, ®ion_is_committed); mi_assert_internal(!(region_is_large && !*is_large)); - mi_assert_internal(start!=NULL); + mi_assert_internal(start != NULL); - void* blocks_start = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bitmap_idx) * MI_SEGMENT_SIZE); - if (*commit && !region_is_committed) { - // ensure commit - bool commit_zero = false; - _mi_os_commit(blocks_start, mi_good_commit_size(size), &commit_zero, tld->stats); // only commit needed size (unless using large OS pages) - if (commit_zero) *is_zero = true; - } - else if (!*commit && region_is_committed) { - // but even when no commit is requested, we might have committed anyway (in a huge OS page for example) - *commit = true; - } 
- - // and return the allocation - mi_assert_internal(blocks_start != NULL); + bool any_zero = false; + *is_zero = mi_bitmap_claim(®ion->dirty, 1, blocks, bit_idx, &any_zero); + if (!mi_option_is_enabled(mi_option_eager_commit)) any_zero = true; // if no eager commit, even dirty segments may be partially committed *is_large = region_is_large; - return blocks_start; + *memid = mi_memid_create(region, bit_idx); + void* p = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE); + if (*commit && !region_is_committed && any_zero) { // want to commit, but not yet fully committed? + // ensure commit + _mi_os_commit(p, blocks * MI_SEGMENT_SIZE, is_zero, tld->stats); + } + else { + *commit = region_is_committed || !any_zero; + } + + + // and return the allocation + mi_assert_internal(p != NULL); + return p; } -/* ---------------------------------------------------------------------------- - Claim and allocate blocks in a region ------------------------------------------------------------------------------*/ - -static bool mi_region_alloc_blocks( - size_t idx, size_t blocks, size_t size, - bool* commit, bool* allow_large, bool* is_zero, - void** p, size_t* id, mi_os_tld_t* tld) -{ - mi_bitmap_index_t bitmap_idx; - if (!mi_bitmap_try_claim_field(regions_map, idx, blocks, &bitmap_idx)) { - return true; // no error, but also no success - } - mi_region_info_t info; - if (!mi_region_ensure_allocated(idx,*allow_large,&info,tld)) { - // failed to allocate region memory, unclaim the bits and fail - mi_bitmap_unclaim(regions_map, MI_REGION_MAX, blocks, bitmap_idx); - return false; - } - *p = mi_region_commit_blocks(bitmap_idx,info,blocks,size,commit,allow_large,is_zero,tld); - *id = mi_memid_create(bitmap_idx); - return true; -} - - -/* ---------------------------------------------------------------------------- - Try to allocate blocks in suitable regions ------------------------------------------------------------------------------*/ - -static bool mi_region_is_suitable(int numa_node, size_t idx, bool commit, bool allow_large ) { - uintptr_t m = mi_atomic_read_relaxed(®ions_map[idx]); - if (m == MI_BITMAP_FIELD_FULL) return false; - if (numa_node >= 0) { // use negative numa node to always succeed - int rnode = ((int)mi_atomic_read_relaxed(®ions[idx].numa_node)) - 1; - if (rnode >= 0 && rnode != numa_node) return false; - } - if (commit && allow_large) return true; // always ok - - // otherwise skip incompatible regions if possible. - // this is not guaranteed due to multiple threads allocating at the same time but - // that's ok. In secure mode, large is never allowed for any thread, so that works out; - // otherwise we might just not be able to reset/decommit individual pages sometimes. - mi_region_info_t info = mi_atomic_read_relaxed(®ions[idx].info); - bool is_large; - bool is_committed; - void* start = mi_region_info_read(info, &is_large, &is_committed); - // note: we also skip if commit is false and the region is committed, - // that is a bit strong but prevents allocation of eager delayed segments in - // committed memory - bool ok = (start == NULL || (commit || !is_committed) || (allow_large || !is_large)); // Todo: test with one bitmap operation? - return ok; -} - -// Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim. -// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. 
-// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
-static bool mi_region_try_alloc_blocks(
-  int numa_node, size_t idx, size_t blocks, size_t size,
-  bool* commit, bool* allow_large, bool* is_zero,
-  void** p, size_t* id, mi_os_tld_t* tld)
-{
-  // check if there are available blocks in the region..
-  mi_assert_internal(idx < MI_REGION_MAX);
-  if (mi_region_is_suitable(numa_node, idx, *commit, *allow_large)) {
-    return mi_region_alloc_blocks(idx, blocks, size, commit, allow_large, is_zero, p, id, tld);
-  }
-  return true; // no error, but no success either
-}
 
 /* ----------------------------------------------------------------------------
   Allocation
 -----------------------------------------------------------------------------*/
@@ -324,63 +295,35 @@ static bool mi_region_try_alloc_blocks(
 
 // Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`.
 // (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`)
-void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero,
-                            size_t* id, mi_os_tld_t* tld)
+void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
 {
-  mi_assert_internal(id != NULL && tld != NULL);
+  mi_assert_internal(memid != NULL && tld != NULL);
   mi_assert_internal(size > 0);
-  *id = 0;
+  *memid = 0;
   *is_zero = false;
   bool default_large = false;
   if (large==NULL) large = &default_large; // ensure `large != NULL`
-
-  // use direct OS allocation for huge blocks or alignment
-  if (size > MI_REGION_MAX_ALLOC_SIZE || alignment > MI_SEGMENT_ALIGN) {
-    size_t arena_memid = 0;
-    void* p = _mi_arena_alloc_aligned(mi_good_commit_size(size), alignment, commit, large, is_zero, &arena_memid, tld); // round up size
-    *id = mi_memid_create_from_arena(arena_memid);
-    return p;
-  }
-
-  // always round size to OS page size multiple (so commit/decommit go over the entire range)
-  // TODO: use large OS page size here?
+  if (size == 0) return NULL;
   size = _mi_align_up(size, _mi_os_page_size());
 
-  // calculate the number of needed blocks
+  // allocate from regions if possible
+  size_t arena_memid;
   const size_t blocks = mi_region_block_count(size);
-  mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE);
-
-  // find a range of free blocks
-  const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld));
-  void* p = NULL;
-  const size_t count = mi_atomic_read(&regions_count);
-  size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses?
-  for (size_t visited = 0; visited < count; visited++, idx++) {
-    if (idx >= count) idx = 0; // wrap around
-    if (!mi_region_try_alloc_blocks(numa_node, idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error
-    if (p != NULL) break;
-  }
-
-  if (p == NULL) {
-    // no free range in existing regions -- try to extend beyond the count.. but at most 8 regions
-    for (idx = count; idx < mi_atomic_read_relaxed(&regions_count) + 8 && idx < MI_REGION_MAX; idx++) {
-      if (!mi_region_try_alloc_blocks(numa_node, idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error
-      if (p != NULL) break;
+  if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN) {
+    void* p = mi_region_try_alloc(blocks, commit, large, is_zero, memid, tld);
+    mi_assert_internal(p == NULL || (uintptr_t)p % alignment == 0);
+    if (p != NULL) {
+      if (*commit) { ((uint8_t*)p)[0] = 0; }
+      return p;
     }
+    _mi_warning_message("unable to allocate from region: size %zu\n", size);
   }
-  if (p == NULL) {
-    // we could not find a place to allocate, fall back to the os directly
-    _mi_warning_message("unable to allocate from region: size %zu\n", size);
-    size_t arena_memid = 0;
-    p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_zero, &arena_memid, tld);
-    *id = mi_memid_create_from_arena(arena_memid);
-  }
-  else {
-    tld->region_idx = idx; // next start of search
-  }
-
+  // and otherwise fall back to the OS
+  void* p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_zero, &arena_memid, tld);
+  *memid = mi_memid_create_from_arena(arena_memid);
   mi_assert_internal( p == NULL || (uintptr_t)p % alignment == 0);
+  if (p != NULL && *commit) { ((uint8_t*)p)[0] = 0; }
   return p;
 }
 
@@ -396,31 +339,28 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
   if (p==NULL) return;
   if (size==0) return;
   size_t arena_memid = 0;
-  mi_bitmap_index_t bitmap_idx;
-  if (mi_memid_indices(id,&bitmap_idx,&arena_memid)) {
+  mi_bitmap_index_t bit_idx;
+  mem_region_t* region;
+  if (mi_memid_indices(id,&region,&bit_idx,&arena_memid)) {
     // was a direct arena allocation, pass through
     _mi_arena_free(p, size, arena_memid, stats);
   }
   else {
     // allocated in a region
-    mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); if (size > MI_REGION_MAX_ALLOC_SIZE) return;
+    mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return;
     // we can align the size up to page size (as we allocate that way too)
     // this ensures we fully commit/decommit/reset
     size = _mi_align_up(size, _mi_os_page_size());
-    const size_t blocks = mi_region_block_count(size);
-    const size_t idx = mi_bitmap_index_field(bitmap_idx);
-    const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
-    mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`?
-    mem_region_t* region = &regions[idx];
+    const size_t blocks = mi_region_block_count(size);
     mi_region_info_t info = mi_atomic_read(&region->info);
     bool is_large;
     bool is_eager_committed;
    void* start = mi_region_info_read(info,&is_large,&is_eager_committed);
     mi_assert_internal(start != NULL);
-    void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
+    void* blocks_start = (uint8_t*)start + (bit_idx * MI_SEGMENT_SIZE);
     mi_assert_internal(blocks_start == p); // not a pointer in our area?
-    mi_assert_internal(bitidx + blocks <= MI_BITMAP_FIELD_BITS);
-    if (blocks_start != p || bitidx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`?
+    mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS);
+    if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`?
     // decommit (or reset) the blocks to reduce the working set.
     // TODO: implement delayed decommit/reset as these calls are too expensive
@@ -446,7 +386,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
     // this frees up virtual address space which might be useful on 32-bit systems?
     // and unclaim
-    mi_bitmap_unclaim(regions_map, MI_REGION_MAX, blocks, bitmap_idx);
+    mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
   }
 }
 
@@ -456,13 +396,15 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
 -----------------------------------------------------------------------------*/
 void _mi_mem_collect(mi_stats_t* stats) {
   // free every region that has no segments in use.
-  for (size_t i = 0; i < regions_count; i++) {
-    if (mi_atomic_read_relaxed(&regions_map[i]) == 0) {
+  uintptr_t rcount = mi_atomic_read_relaxed(&regions_count);
+  for (size_t i = 0; i < rcount; i++) {
+    mem_region_t* region = &regions[i];
+    if (mi_atomic_read_relaxed(&region->info) != 0) {
       // if no segments used, try to claim the whole region
       uintptr_t m;
       do {
-        m = mi_atomic_read_relaxed(&regions_map[i]);
-      } while(m == 0 && !mi_atomic_cas_weak(&regions_map[i], MI_BITMAP_FIELD_FULL, 0 ));
+        m = mi_atomic_read_relaxed(&region->in_use);
+      } while(m == 0 && !mi_atomic_cas_weak(&region->in_use, MI_BITMAP_FIELD_FULL, 0 ));
       if (m == 0) {
         // on success, free the whole region
         bool is_eager_committed;
@@ -471,9 +413,7 @@ void _mi_mem_collect(mi_stats_t* stats) {
           _mi_arena_free(start, MI_REGION_SIZE, regions[i].arena_memid, stats);
         }
         // and release
-        mi_atomic_write(&regions[i].info,0);
-        mi_atomic_write(&regions_dirty[i],0);
-        mi_atomic_write(&regions_map[i],0);
+        mi_atomic_write(&region->info,0);
       }
     }
   }
diff --git a/src/segment.c b/src/segment.c
index 178e0eda..b2b37fac 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -370,7 +370,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind,
     }
     segment->memid = memid;
     segment->mem_is_fixed = mem_large;
-    segment->mem_is_committed = commit;
+    segment->mem_is_committed = commit;
     mi_segments_track_size((long)segment_size, tld);
   }
   mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);

From d2279b2a3faf7c2e084644449326306ef8d4f619 Mon Sep 17 00:00:00 2001
From: Daan Leijen
Date: Sun, 10 Nov 2019 08:13:40 -0800
Subject: [PATCH 12/12] update test-stress with better object distribution

---
 test/test-stress.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/test-stress.c b/test/test-stress.c
index be2a9c67..37572d42 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -17,8 +17,8 @@ terms of the MIT license.
 #include
 
 // argument defaults
-static int THREADS = 32; // more repeatable if THREADS <= #processors
-static int N = 40; // scaling factor
+static int THREADS = 8; // more repeatable if THREADS <= #processors
+static int N = 200; // scaling factor
 
 // static int THREADS = 8; // more repeatable if THREADS <= #processors
 // static int N = 100; // scaling factor
 
@@ -63,7 +63,11 @@ static bool chance(size_t perc, random_t r) {
 }
 
 static void* alloc_items(size_t items, random_t r) {
-  if (chance(1, r)) items *= 100; // 1% huge objects;
+  if (chance(1, r)) {
+    if (chance(1, r)) items *= 1000; // 0.01% giant
+    else if (chance(10, r)) items *= 100; // 0.1% huge
+    else items *= 10; // 1% large objects;
+  }
   if (items==40) items++; // pthreads uses that size for stack increases
   uintptr_t* p = (uintptr_t*)mi_malloc(items*sizeof(uintptr_t));
   if (p != NULL) {
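
For reference, the revised alloc_items in [PATCH 12/12] produces a heavy-tailed size distribution: about 0.01% of allocations become giant (x1000), about 0.1% huge (x100), and about 1% large (x10), with the remaining ~99% unchanged. The following standalone C sketch is illustrative only; it is not part of the patch, substitutes rand() for the test's random_t generator, and assumes chance(perc, r) succeeds with probability perc percent. It tallies the buckets over a million draws to make those proportions concrete.

#include <stdio.h>
#include <stdlib.h>

// Simplified stand-in for the test's chance(perc, r): true roughly perc% of the time.
static int chance(int perc) {
  return (rand() % 100) < perc;
}

int main(void) {
  long counts[4] = { 0, 0, 0, 0 };  // [0]=normal, [1]=large x10, [2]=huge x100, [3]=giant x1000
  const long trials = 1000000;
  srand(42);
  for (long i = 0; i < trials; i++) {
    int bucket = 0;
    if (chance(1)) {                   // ~1% of allocations get scaled up
      if (chance(1)) bucket = 3;       // ~0.01% giant
      else if (chance(10)) bucket = 2; // ~0.1%  huge
      else bucket = 1;                 // ~1%    large
    }
    counts[bucket]++;
  }
  printf("normal=%ld large=%ld huge=%ld giant=%ld\n",
         counts[0], counts[1], counts[2], counts[3]);
  return 0;
}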