merge from dev-exp; bitmap based arena

Daan Leijen 2019-11-10 07:56:40 -08:00
commit fed0068dac
14 changed files with 420 additions and 268 deletions


@@ -7,15 +7,23 @@ terms of the MIT license. A copy of the license can be found in the file
/* ----------------------------------------------------------------------------
"Arenas" are fixed area's of OS memory from which we can allocate
large blocks (>= MI_ARENA_BLOCK_SIZE, 16MiB). Currently only used to
allocate in one arena consisting of huge OS pages -- otherwise it
delegates to direct allocation from the OS.
large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB).
In contrast to the rest of mimalloc, the arenas are shared between
threads and need to be accessed using atomic operations.
In the future, we can expose an API to manually add more arenas which
is sometimes needed for embedded devices or shared memory for example.
Currently arenas are only used for huge OS page (1GiB) reservations,
otherwise it delegates to direct allocation from the OS.
In the future, we can expose an API to manually add more kinds of arenas
which is sometimes needed for embedded devices or shared memory for example.
(We can also employ this with WASI or `sbrk` systems to reserve large arenas
on demand and reuse them efficiently).
The arena allocation needs to be thread safe and we use a lock-free scan
with on-demand coalescing.
The arena allocation needs to be thread safe and we use an atomic
bitmap to allocate. The current implementation of the bitmap can
only do this within a field (`uintptr_t`) so we can allocate blocks of at most
2GiB (64*32MiB) and no object can cross a field boundary. This
can lead to fragmentation, but fortunately most objects will be regions
of 256MiB in practice.
-----------------------------------------------------------------------------*/
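As a quick illustration of the sizing described above, here is a standalone sketch (not part of this commit; the constants simply mirror MI_ARENA_BLOCK_SIZE and a 64-bit bitmap field):

// Standalone sketch of the arena sizing arithmetic: 32MiB blocks,
// at most one bitmap field (64 bits on 64-bit) worth of blocks per object.
#include <stdio.h>
#include <stdint.h>

#define ARENA_BLOCK_SIZE  (32*1024*1024UL)       // mirrors MI_ARENA_BLOCK_SIZE
#define BITMAP_FIELD_BITS (8*sizeof(uintptr_t))  // 64 on a 64-bit machine

static size_t block_count_of_size(size_t size) { // divide-up, like mi_block_count_of_size
  return (size + ARENA_BLOCK_SIZE - 1) / ARENA_BLOCK_SIZE;
}

int main(void) {
  size_t max_obj = BITMAP_FIELD_BITS * ARENA_BLOCK_SIZE;                // one field: 2GiB
  printf("max object size: %zu MiB\n", max_obj / (1024*1024));          // 2048
  printf("blocks for a 256MiB region: %zu\n", block_count_of_size(256*1024*1024UL)); // 8
  return 0;
}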
#include "mimalloc.h"
#include "mimalloc-internal.h"
@@ -23,9 +31,10 @@ with on-demand coalescing.
#include <string.h> // memset
#include "bitmap.inc.c" // atomic bitmap
// os.c
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld);
//int _mi_os_alloc_huge_os_pages(size_t pages, double max_secs, void** pstart, size_t* pages_reserved, size_t* psize) mi_attr_noexcept;
void _mi_os_free(void* p, size_t size, mi_stats_t* stats);
void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize);
@@ -38,23 +47,27 @@ int _mi_os_numa_node_count(void);
Arena allocation
----------------------------------------------------------- */
#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
#define MI_ARENA_BLOCK_SIZE MI_SEGMENT_SIZE
#define MI_MAX_ARENAS (64)
// Block info: bit 0 contains the `in_use` bit, the upper bits the
// size in count of arena blocks.
typedef uintptr_t mi_block_info_t;
#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE/2) // 32MiB
#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_FIELD_BITS * MI_ARENA_BLOCK_SIZE) // 2GiB
#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 16MiB
#define MI_MAX_ARENAS (64) // not more than 256 (since we use 8 bits in the memid)
// A memory arena descriptor
typedef struct mi_arena_s {
uint8_t* start; // the start of the memory area
size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
size_t field_count; // number of bitmap fields
int numa_node; // associated NUMA node
bool is_zero_init; // is the arena zero initialized?
bool is_large; // large OS page allocated
_Atomic(uintptr_t) block_bottom; // optimization to start the search for free blocks
_Atomic(mi_block_info_t) blocks[1]; // `block_count` block info's
volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks
mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
mi_bitmap_field_t blocks_map[1]; // bitmap of in-use blocks
} mi_arena_t;
@@ -71,184 +84,57 @@ static _Atomic(uintptr_t) mi_arena_count; // = 0
// Use `0` as a special id for direct OS allocated memory.
#define MI_MEMID_OS 0
static size_t mi_memid_create(size_t arena_index, size_t block_index) {
mi_assert_internal(arena_index < 0xFF);
return ((block_index << 8) | ((arena_index+1) & 0xFF));
static size_t mi_memid_create(size_t arena_index, mi_bitmap_index_t bitmap_index) {
mi_assert_internal(arena_index < 0xFE);
mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow?
return ((bitmap_index << 8) | ((arena_index+1) & 0xFF));
}
static void mi_memid_indices(size_t memid, size_t* arena_index, size_t* block_index) {
static void mi_memid_indices(size_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) {
mi_assert_internal(memid != MI_MEMID_OS);
*arena_index = (memid & 0xFF) - 1;
*block_index = (memid >> 8);
*bitmap_index = (memid >> 8);
}
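As an aside, the memid packing above round-trips like this (a standalone sketch, not mimalloc code; the names and the main() harness are just for illustration):

// Low 8 bits hold (arena_index+1) so that 0 stays reserved for MI_MEMID_OS;
// the remaining bits hold the bitmap index.
#include <assert.h>
#include <stddef.h>

static size_t memid_create(size_t arena_index, size_t bitmap_index) {
  return ((bitmap_index << 8) | ((arena_index + 1) & 0xFF));
}

static void memid_indices(size_t memid, size_t* arena_index, size_t* bitmap_index) {
  *arena_index  = (memid & 0xFF) - 1;
  *bitmap_index = (memid >> 8);
}

int main(void) {
  size_t arena, bit;
  memid_indices(memid_create(3, 42), &arena, &bit);
  assert(arena == 3 && bit == 42);  // round-trips as long as the bitmap index fits in the upper bits
  return 0;
}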
/* -----------------------------------------------------------
Block info
----------------------------------------------------------- */
static bool mi_block_is_in_use(mi_block_info_t info) {
return ((info&1) != 0);
static size_t mi_block_count_of_size(size_t size) {
return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
}
static size_t mi_block_count(mi_block_info_t info) {
return (info>>1);
}
static mi_block_info_t mi_block_info_create(size_t bcount, bool in_use) {
return (((mi_block_info_t)bcount << 1) | (in_use ? 1 : 0));
}
/* -----------------------------------------------------------
Thread safe allocation in an arena
----------------------------------------------------------- */
static void* mi_arena_allocx(mi_arena_t* arena, size_t start_idx, size_t end_idx, size_t needed_bcount, bool* is_zero, size_t* block_index)
static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
{
// Scan linearly through all block infos,
// skipping used ranges and coalescing free ranges on demand.
mi_assert_internal(needed_bcount > 0);
mi_assert_internal(start_idx <= arena->block_count);
mi_assert_internal(end_idx <= arena->block_count);
_Atomic(mi_block_info_t)* block = &arena->blocks[start_idx];
_Atomic(mi_block_info_t)* end = &arena->blocks[end_idx];
while (block < end) {
mi_block_info_t binfo = mi_atomic_read_relaxed(block);
size_t bcount = mi_block_count(binfo);
if (mi_block_is_in_use(binfo)) {
// in-use, skip ahead
mi_assert_internal(bcount > 0);
block += bcount;
}
else {
// free blocks
if (bcount==0) {
// optimization:
// use 0 initialized blocks at the end, to use single atomic operation
// initially to reduce contention (as we don't need to split)
if (block + needed_bcount > end) {
return NULL; // does not fit
}
else if (!mi_atomic_cas_weak(block, mi_block_info_create(needed_bcount, true), binfo)) {
// ouch, someone else was quicker. Try again..
continue;
}
else {
// we got it: return a pointer to the claimed memory
ptrdiff_t idx = (block - arena->blocks);
*is_zero = arena->is_zero_init;
*block_index = idx;
return (arena->start + (idx*MI_ARENA_BLOCK_SIZE));
}
}
mi_assert_internal(bcount>0);
if (needed_bcount > bcount) {
#if 0 // MI_NO_ARENA_COALESCE
block += bcount; // too small, skip to the next range
continue;
#else
// too small, try to coalesce
_Atomic(mi_block_info_t)* block_next = block + bcount;
if (block_next >= end) {
return NULL; // does not fit
}
mi_block_info_t binfo_next = mi_atomic_read(block_next);
size_t bcount_next = mi_block_count(binfo_next);
if (mi_block_is_in_use(binfo_next)) {
// next block is in use, cannot coalesce
block += (bcount + bcount_next); // skip ahead over both blocks
}
else {
// next block is free, try to coalesce
// first set the next one to being used to prevent dangling ranges
if (!mi_atomic_cas_strong(block_next, mi_block_info_create(bcount_next, true), binfo_next)) {
// someone else got in before us.. try again
continue;
}
else {
if (!mi_atomic_cas_strong(block, mi_block_info_create(bcount + bcount_next, true), binfo)) { // use strong to increase success chance
// someone claimed/coalesced the block in the meantime
// first free the next block again..
bool ok = mi_atomic_cas_strong(block_next, mi_block_info_create(bcount_next, false), binfo_next); // must be strong
mi_assert(ok); UNUSED(ok);
// and try again
continue;
}
else {
// coalesced! try again
// todo: we could optimize here to immediately claim the block if the
// coalesced size is a fit instead of retrying. Keep it simple for now.
continue;
}
}
}
#endif
}
else { // needed_bcount <= bcount
mi_assert_internal(needed_bcount <= bcount);
// it fits, claim the whole block
if (!mi_atomic_cas_weak(block, mi_block_info_create(bcount, true), binfo)) {
// ouch, someone else was quicker. Try again..
continue;
}
else {
// got it, now split off the needed part
if (needed_bcount < bcount) {
mi_atomic_write(block + needed_bcount, mi_block_info_create(bcount - needed_bcount, false));
mi_atomic_write(block, mi_block_info_create(needed_bcount, true));
}
// return a pointer to the claimed memory
ptrdiff_t idx = (block - arena->blocks);
*is_zero = false;
*block_index = idx;
return (arena->start + (idx*MI_ARENA_BLOCK_SIZE));
}
}
const size_t fcount = arena->field_count;
size_t idx = mi_atomic_read(&arena->search_idx); // start from last search
for (size_t visited = 0; visited < fcount; visited++, idx++) {
if (idx >= fcount) idx = 0; // wrap around
if (mi_bitmap_try_claim_field(arena->blocks_map, idx, blocks, bitmap_idx)) {
mi_atomic_write(&arena->search_idx, idx); // start search from here next time
return true;
}
}
// no success
return NULL;
return false;
}
// Try to reduce search time by starting from bottom and wrap around.
static void* mi_arena_alloc(mi_arena_t* arena, size_t needed_bcount, bool* is_zero, size_t* block_index)
{
uintptr_t bottom = mi_atomic_read_relaxed(&arena->block_bottom);
void* p = mi_arena_allocx(arena, bottom, arena->block_count, needed_bcount, is_zero, block_index);
if (p == NULL && bottom > 0) {
// try again from the start
p = mi_arena_allocx(arena, 0, bottom, needed_bcount, is_zero, block_index);
}
if (p != NULL) {
mi_atomic_write(&arena->block_bottom, *block_index);
}
return p;
}
/* -----------------------------------------------------------
Arena Allocation
----------------------------------------------------------- */
static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
bool* commit, bool* large, bool* is_zero,
size_t* memid)
bool* commit, bool* large, bool* is_zero, size_t* memid)
{
size_t block_index = SIZE_MAX;
void* p = mi_arena_alloc(arena, needed_bcount, is_zero, &block_index);
if (p != NULL) {
mi_assert_internal(block_index != SIZE_MAX);
#if MI_DEBUG>=1
_Atomic(mi_block_info_t)* block = &arena->blocks[block_index];
mi_block_info_t binfo = mi_atomic_read(block);
mi_assert_internal(mi_block_is_in_use(binfo));
mi_assert_internal(mi_block_count(binfo) >= needed_bcount);
#endif
*memid = mi_memid_create(arena_index, block_index);
*commit = true; // TODO: support commit on demand?
*large = arena->is_large;
mi_bitmap_index_t bitmap_index;
if (mi_arena_alloc(arena, needed_bcount, &bitmap_index)) {
// claimed it! set the dirty bits (todo: no need for an atomic op here?)
*is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
*memid = mi_memid_create(arena_index, bitmap_index);
*commit = true; // TODO: support commit on demand?
*large = arena->is_large;
return (arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE));
}
return p;
return NULL;
}
void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
@@ -257,21 +143,19 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
{
mi_assert_internal(memid != NULL && tld != NULL);
mi_assert_internal(size > 0);
*memid = MI_MEMID_OS;
*memid = MI_MEMID_OS;
*is_zero = false;
bool default_large = false;
if (large==NULL) large = &default_large; // ensure `large != NULL`
// try to allocate in an arena if the alignment is small enough
// and if there is not too much waste around the `MI_ARENA_BLOCK_SIZE`.
if (alignment <= MI_SEGMENT_ALIGN &&
size >= 3*(MI_ARENA_BLOCK_SIZE/4) && // > 48MiB (not more than 25% waste)
!(size > MI_ARENA_BLOCK_SIZE && size < 3*(MI_ARENA_BLOCK_SIZE/2)) // ! <64MiB - 96MiB>
)
// and the object is not too large or too small.
if (alignment <= MI_SEGMENT_ALIGN &&
size <= MI_ARENA_MAX_OBJ_SIZE &&
size >= MI_ARENA_MIN_OBJ_SIZE)
{
size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE);
size_t bcount = asize / MI_ARENA_BLOCK_SIZE;
int numa_node = _mi_os_numa_node(tld); // current numa node
const size_t bcount = mi_block_count_of_size(size);
const int numa_node = _mi_os_numa_node(tld); // current numa node
mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
// try numa affine allocation
@@ -302,7 +186,10 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
// finally, fall back to the OS
*is_zero = true;
*memid = MI_MEMID_OS;
*memid = MI_MEMID_OS;
if (*large) {
*large = mi_option_is_enabled(mi_option_large_os_pages); // try large OS pages only if enabled and allowed
}
return _mi_os_alloc_aligned(size, alignment, *commit, large, tld);
}
@@ -326,8 +213,8 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) {
else {
// allocated in an arena
size_t arena_idx;
size_t block_idx;
mi_memid_indices(memid, &arena_idx, &block_idx);
size_t bitmap_idx;
mi_memid_indices(memid, &arena_idx, &bitmap_idx);
mi_assert_internal(arena_idx < MI_MAX_ARENAS);
mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[arena_idx]));
mi_assert_internal(arena != NULL);
@@ -335,27 +222,17 @@ void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) {
_mi_fatal_error("trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
return;
}
mi_assert_internal(arena->block_count > block_idx);
if (arena->block_count <= block_idx) {
_mi_fatal_error("trying to free from non-existent block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx));
if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) {
_mi_fatal_error("trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
return;
}
_Atomic(mi_block_info_t)* block = &arena->blocks[block_idx];
mi_block_info_t binfo = mi_atomic_read_relaxed(block);
mi_assert_internal(mi_block_is_in_use(binfo));
mi_assert_internal(mi_block_count(binfo)*MI_ARENA_BLOCK_SIZE >= size);
if (!mi_block_is_in_use(binfo)) {
const size_t blocks = mi_block_count_of_size(size);
bool ones = mi_bitmap_unclaim(arena->blocks_map, arena->field_count, blocks, bitmap_idx);
if (!ones) {
_mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size);
return;
};
bool ok = mi_atomic_cas_strong(block, mi_block_info_create(mi_block_count(binfo), false), binfo);
mi_assert_internal(ok);
if (!ok) {
_mi_warning_message("unable to free arena block: %p, info 0x%zx", p, binfo);
}
if (block_idx < mi_atomic_read_relaxed(&arena->block_bottom)) {
mi_atomic_write(&arena->block_bottom, block_idx);
}
}
}
@@ -367,8 +244,7 @@ static bool mi_arena_add(mi_arena_t* arena) {
mi_assert_internal(arena != NULL);
mi_assert_internal((uintptr_t)arena->start % MI_SEGMENT_ALIGN == 0);
mi_assert_internal(arena->block_count > 0);
mi_assert_internal(mi_mem_is_zero(arena->blocks,arena->block_count*sizeof(mi_block_info_t)));
uintptr_t i = mi_atomic_addu(&mi_arena_count,1);
if (i >= MI_MAX_ARENAS) {
mi_atomic_subu(&mi_arena_count, 1);
@@ -385,40 +261,51 @@ static bool mi_arena_add(mi_arena_t* arena) {
#include <errno.h> // ENOMEM
// reserve at a specific numa node
int mi_reserve_huge_os_pages_at(size_t pages, int numa_node) mi_attr_noexcept {
int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {
if (pages==0) return 0;
if (numa_node < -1) numa_node = -1;
if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count();
size_t hsize = 0;
size_t pages_reserved = 0;
void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, pages*500, &pages_reserved, &hsize);
void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize);
if (p==NULL || pages_reserved==0) {
_mi_warning_message("failed to reserve %zu gb huge pages\n", pages);
return ENOMEM;
}
_mi_verbose_message("reserved %zu gb huge pages\n", pages_reserved);
size_t bcount = hsize / MI_ARENA_BLOCK_SIZE;
size_t asize = sizeof(mi_arena_t) + (bcount*sizeof(mi_block_info_t)); // one too much
size_t bcount = mi_block_count_of_size(hsize);
size_t fields = (bcount + MI_BITMAP_FIELD_BITS - 1) / MI_BITMAP_FIELD_BITS;
size_t asize = sizeof(mi_arena_t) + (2*fields*sizeof(mi_bitmap_field_t));
mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
if (arena == NULL) {
_mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
return ENOMEM;
}
arena->block_count = bcount;
arena->start = (uint8_t*)p;
arena->block_bottom = 0;
arena->field_count = fields;
arena->start = (uint8_t*)p;
arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
arena->is_large = true;
arena->is_zero_init = true;
memset(arena->blocks, 0, bcount * sizeof(mi_block_info_t));
arena->search_idx = 0;
arena->blocks_dirty = &arena->blocks_map[bcount];
// the bitmaps are already zero initialized due to os_alloc
// just claim leftover blocks if needed
size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
if (post > 0) {
// don't use leftover bits at the end
mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
mi_bitmap_claim(arena->blocks_map, fields, post, postidx, NULL);
}
mi_arena_add(arena);
return 0;
}
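To make the leftover-bit handling above concrete, here is a worked example with assumed numbers (5 huge pages of 1GiB; standalone and illustrative only):

#include <stdio.h>
int main(void) {
  const unsigned long long GiB = 1024ULL*1024*1024;
  unsigned long long hsize  = 5*GiB;                      // 5 reserved huge pages of 1GiB each
  unsigned long long bcount = hsize / (32ULL*1024*1024);  // 160 blocks of 32MiB
  unsigned long long fields = (bcount + 63) / 64;         // 3 bitmap fields
  unsigned long long post   = fields*64 - bcount;         // 32 leftover bits, pre-claimed
  printf("blocks=%llu fields=%llu leftover=%llu\n", bcount, fields, post);
  return 0;
}

Pre-claiming those 32 trailing bits of the last field means the allocator can never hand out blocks past the end of the reserved memory.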
// reserve huge pages evenly among all numa nodes.
int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept {
int mi_reserve_huge_os_pages_interleave(size_t pages, size_t timeout_msecs) mi_attr_noexcept {
if (pages == 0) return 0;
// pages per numa node
@@ -426,12 +313,13 @@ int mi_reserve_huge_os_pages_interleave(size_t pages) mi_attr_noexcept {
if (numa_count <= 0) numa_count = 1;
const size_t pages_per = pages / numa_count;
const size_t pages_mod = pages % numa_count;
const size_t timeout_per = (timeout_msecs / numa_count) + 50;
// reserve evenly among numa nodes
for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
size_t node_pages = pages_per; // can be 0
if ((size_t)numa_node < pages_mod) node_pages++;
int err = mi_reserve_huge_os_pages_at(node_pages, numa_node);
int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per);
if (err) return err;
if (pages < node_pages) {
pages = 0;
@@ -448,7 +336,7 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv
UNUSED(max_secs);
_mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
if (pages_reserved != NULL) *pages_reserved = 0;
int err = mi_reserve_huge_os_pages_interleave(pages);
int err = mi_reserve_huge_os_pages_interleave(pages, (size_t)(max_secs * 1000.0));
if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
return err;
}

src/bitmap.inc.c (new file, +208 lines)

@@ -0,0 +1,208 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
This file is meant to be included in other files for efficiency.
It implements a bitmap that can set/reset sequences of bits atomically
and is used to concurrently claim memory ranges.
A bitmap is an array of fields where each field is a machine word (`uintptr_t`).
A current limitation is that a bit sequence cannot cross field boundaries
and must be smaller than or equal to the number of bits in a field.
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITMAP_C
#define MI_BITMAP_C
#include "mimalloc.h"
#include "mimalloc-internal.h"
/* -----------------------------------------------------------
Bitmap definition
----------------------------------------------------------- */
#define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE)
#define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set
// An atomic bitmap of `uintptr_t` fields
typedef volatile _Atomic(uintptr_t) mi_bitmap_field_t;
typedef mi_bitmap_field_t* mi_bitmap_t;
// A bitmap index is the index of the bit in a bitmap.
typedef size_t mi_bitmap_index_t;
// Create a bit index.
static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
}
// Get the field index from a bit index.
static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx / MI_BITMAP_FIELD_BITS);
}
// Get the bit index in a bitmap field
static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx % MI_BITMAP_FIELD_BITS);
}
// Get the full bit index
static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
return bitmap_idx;
}
// The bit mask for a given number of blocks at a specified bit index.
static uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) {
mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
if (count == MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
return ((((uintptr_t)1 << count) - 1) << bitidx);
}
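For example, a mask for 3 bits at bit index 5 is ((1<<3)-1)<<5 == 0xE0; a tiny standalone check (illustrative only, not part of this file):

#include <assert.h>
#include <stdint.h>

int main(void) {
  assert(((((uintptr_t)1 << 3) - 1) << 5) == (uintptr_t)0xE0);  // 0b111 shifted to bits 5..7
  // the explicit full-field case above avoids shifting by the word width,
  // which would be undefined behaviour
  return 0;
}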
/* -----------------------------------------------------------
Use bit scan forward/reverse to quickly find the first zero bit if it is available
----------------------------------------------------------- */
#if defined(_MSC_VER)
#define MI_HAVE_BITSCAN
#include <intrin.h>
static inline size_t mi_bsf(uintptr_t x) {
if (x==0) return 8*MI_INTPTR_SIZE;
DWORD idx;
MI_64(_BitScanForward)(&idx, x);
return idx;
}
static inline size_t mi_bsr(uintptr_t x) {
if (x==0) return 8*MI_INTPTR_SIZE;
DWORD idx;
MI_64(_BitScanReverse)(&idx, x);
return idx;
}
#elif defined(__GNUC__) || defined(__clang__)
#include <limits.h> // LONG_MAX
#define MI_HAVE_BITSCAN
#if (INTPTR_MAX == LONG_MAX)
# define MI_L(x) x##l
#else
# define MI_L(x) x##ll
#endif
static inline size_t mi_bsf(uintptr_t x) {
return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x));
}
static inline size_t mi_bsr(uintptr_t x) {
return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x));
}
#endif
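The expected semantics of these helpers, checked standalone (this assumes an LP64 GCC/Clang target so that `__builtin_ctzl`/`__builtin_clzl` match the wrappers above):

#include <assert.h>

int main(void) {
  unsigned long x = 0x28;               // 0b101000
  assert(__builtin_ctzl(x) == 3);       // mi_bsf: index of the lowest set bit
  assert(63 - __builtin_clzl(x) == 5);  // mi_bsr: index of the highest set bit
  return 0;
}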
/* -----------------------------------------------------------
Claim a bit sequence atomically
----------------------------------------------------------- */
// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
{
mi_assert_internal(bitmap_idx != NULL);
volatile _Atomic(uintptr_t)* field = &bitmap[idx];
uintptr_t map = mi_atomic_read(field);
if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
// search for 0-bit sequence of length count
const uintptr_t mask = mi_bitmap_mask_(count, 0);
const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
#ifdef MI_HAVE_BITSCAN
size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible
#else
size_t bitidx = 0; // otherwise start at 0
#endif
uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx
// scan linearly for a free range of zero bits
while (bitidx <= bitidx_max) {
if ((map & m) == 0) { // are the mask bits free at bitidx?
mi_assert_internal((m >> bitidx) == mask); // no overflow?
const uintptr_t newmap = map | m;
mi_assert_internal((newmap^map) >> bitidx == mask);
if (!mi_atomic_cas_weak(field, newmap, map)) { // TODO: use strong cas here?
// no success, another thread claimed concurrently.. keep going
map = mi_atomic_read(field);
continue;
}
else {
// success, we claimed the bits!
*bitmap_idx = mi_bitmap_index_create(idx, bitidx);
return true;
}
}
else {
// on to the next bit range
#ifdef MI_HAVE_BITSCAN
const size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
mi_assert_internal(shift > 0 && shift <= count);
#else
const size_t shift = 1;
#endif
bitidx += shift;
m <<= shift;
}
}
// no bits found
return false;
}
// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
static inline bool mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t* bitmap_idx) {
for (size_t idx = 0; idx < bitmap_fields; idx++) {
if (mi_bitmap_try_claim_field(bitmap, idx, count, bitmap_idx)) {
return true;
}
}
return false;
}
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously
static inline bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
mi_assert_internal((bitmap[idx] & mask) == mask);
uintptr_t prev = mi_atomic_and(&bitmap[idx], ~mask);
return ((prev & mask) == mask);
}
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously
static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
// mi_assert_internal((bitmap[idx] & mask) == 0);
uintptr_t prev = mi_atomic_or(&bitmap[idx], mask);
if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
return ((prev & mask) == 0);
}
// Returns `true` if all `count` bits were 1
static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
// mi_assert_internal((bitmap[idx] & mask) == 0);
return ((mi_atomic_read(&bitmap[idx]) & mask) == mask);
}
#endif
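A hypothetical usage sketch of this bitmap (not part of the commit; it assumes the mimalloc include directory is on the path, and that the internal asserts either compile away or the library is linked in):

#include <stdbool.h>
#include "bitmap.inc.c"

int main(void) {
  mi_bitmap_field_t map[2] = { 0, 0 };                   // 2 fields, all bits free
  mi_bitmap_index_t idx;
  if (mi_bitmap_try_claim(map, 2, 4, &idx)) {            // claim a run of 4 free bits
    bool were_set = mi_bitmap_unclaim(map, 2, 4, idx);   // release them again
    (void)were_set;                                      // true: all 4 bits were previously 1
  }
  return 0;
}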


@@ -457,9 +457,8 @@ static void mi_process_load(void) {
}
if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
size_t pages = mi_option_get(mi_option_reserve_huge_os_pages);
// double max_secs = (double)pages / 2.0; // 0.5s per page (1GiB)
mi_reserve_huge_os_pages_interleave(pages);
size_t pages = mi_option_get(mi_option_reserve_huge_os_pages);
mi_reserve_huge_os_pages_interleave(pages, pages*500);
}
}


@@ -939,16 +939,18 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
_mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
// check for timeout
mi_msecs_t elapsed = _mi_clock_end(start_t);
if (page >= 1) {
mi_msecs_t estimate = ((elapsed / (page+1)) * pages);
if (estimate > 2*max_msecs) { // seems like we are going to timeout, break
elapsed = max_msecs + 1;
if (max_msecs > 0) {
mi_msecs_t elapsed = _mi_clock_end(start_t);
if (page >= 1) {
mi_msecs_t estimate = ((elapsed / (page+1)) * pages);
if (estimate > 2*max_msecs) { // seems like we are going to timeout, break
elapsed = max_msecs + 1;
}
}
if (elapsed > max_msecs) {
_mi_warning_message("huge page allocation timed out\n");
break;
}
}
if (elapsed > max_msecs) {
_mi_warning_message("huge page allocation timed out\n");
break;
}
}
mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size);
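With the budget the callers use (pages*500 ms), the early-exit projection above works out like this (illustrative numbers only, standalone):

#include <stdio.h>
int main(void) {
  long pages = 100, max_msecs = pages*500;            // 50000ms budget
  long page = 4, elapsed = 6000;                      // 5 pages done after 6 seconds
  long estimate = (elapsed/(page+1))*pages;           // (6000/5)*100 = 120000ms projected
  if (estimate > 2*max_msecs) elapsed = max_msecs+1;  // projected to blow the budget
  printf("%s\n", elapsed > max_msecs ? "give up early" : "keep reserving");
  return 0;
}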
@@ -1045,9 +1047,10 @@ int _mi_os_numa_node_count(void) {
int _mi_os_numa_node(mi_os_tld_t* tld) {
UNUSED(tld);
int numa_node = mi_os_numa_nodex();
// never more than the node count and >= 0
int numa_count = _mi_os_numa_node_count();
if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
// never more than the node count and >= 0
int numa_node = mi_os_numa_nodex();
if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
if (numa_node < 0) numa_node = 0;
return numa_node;


@@ -436,15 +436,15 @@ void _mi_page_retire(mi_page_t* page) {
#define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT)
#define MI_MIN_SLICES (2)
static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, size_t extend, mi_stats_t* stats) {
static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t extend, mi_stats_t* const stats) {
UNUSED(stats);
#if (MI_SECURE<=2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
#endif
mi_assert_internal(page->capacity + extend <= page->reserved);
void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
size_t bsize = page->block_size;
void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
const size_t bsize = page->block_size;
// initialize a randomized free list
// set up `slice_count` slices to alternate between
@@ -452,8 +452,8 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
while ((extend >> shift) == 0) {
shift--;
}
size_t slice_count = (size_t)1U << shift;
size_t slice_extend = extend / slice_count;
const size_t slice_count = (size_t)1U << shift;
const size_t slice_extend = extend / slice_count;
mi_assert_internal(slice_extend >= 1);
mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice
size_t counts[MI_MAX_SLICES]; // available objects in the slice
@@ -467,12 +467,12 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
// set up first element
size_t current = _mi_heap_random(heap) % slice_count;
counts[current]--;
page->free = blocks[current];
mi_block_t* const free_start = blocks[current];
// and iterate through the rest
uintptr_t rnd = heap->random;
for (size_t i = 1; i < extend; i++) {
// call random_shuffle only every INTPTR_SIZE rounds
size_t round = i%MI_INTPTR_SIZE;
const size_t round = i%MI_INTPTR_SIZE;
if (round == 0) rnd = _mi_random_shuffle(rnd);
// select a random next slice index
size_t next = ((rnd >> 8*round) & (slice_count-1));
@@ -482,34 +482,39 @@ static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, si
}
// and link the current block to it
counts[next]--;
mi_block_t* block = blocks[current];
mi_block_t* const block = blocks[current];
blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block
mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next`
current = next;
}
mi_block_set_next(page, blocks[current], NULL); // end of the list
// prepend to the free list (usually NULL)
mi_block_set_next(page, blocks[current], page->free); // end of the list
page->free = free_start;
heap->random = _mi_random_shuffle(rnd);
}
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats)
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t extend, mi_stats_t* const stats)
{
UNUSED(stats);
#if (MI_SECURE <= 2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
#endif
mi_assert_internal(page->capacity + extend <= page->reserved);
void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
size_t bsize = page->block_size;
mi_block_t* start = mi_page_block_at(page, page_area, page->capacity);
void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
const size_t bsize = page->block_size;
mi_block_t* const start = mi_page_block_at(page, page_area, page->capacity);
// initialize a sequential free list
mi_block_t* last = mi_page_block_at(page, page_area, page->capacity + extend - 1);
mi_block_t* const last = mi_page_block_at(page, page_area, page->capacity + extend - 1);
mi_block_t* block = start;
while(block <= last) {
mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize);
mi_block_set_next(page,block,next);
block = next;
}
mi_block_set_next(page, last, NULL);
// prepend to free list (usually `NULL`)
mi_block_set_next(page, last, page->free);
page->free = start;
}


@@ -130,19 +130,23 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, const
char buf[32];
int len = 32;
const char* suffix = (unit <= 0 ? " " : "b");
double base = (unit == 0 ? 1000.0 : 1024.0);
const int64_t base = (unit == 0 ? 1000 : 1024);
if (unit>0) n *= unit;
double pos = (double)(n < 0 ? -n : n);
if (pos < base)
snprintf(buf,len, "%d %s ", (int)n, suffix);
else if (pos < base*base)
snprintf(buf, len, "%.1f k%s", (double)n / base, suffix);
else if (pos < base*base*base)
snprintf(buf, len, "%.1f m%s", (double)n / (base*base), suffix);
else
snprintf(buf, len, "%.1f g%s", (double)n / (base*base*base), suffix);
const int64_t pos = (n < 0 ? -n : n);
if (pos < base) {
snprintf(buf, len, "%d %s ", (int)n, suffix);
}
else {
int64_t divider = base;
const char* magnitude = "k";
if (pos >= divider*base) { divider *= base; magnitude = "m"; }
if (pos >= divider*base) { divider *= base; magnitude = "g"; }
const int64_t tens = (n / (divider/10));
const long whole = (long)(tens/10);
const long frac1 = (long)(tens%10);
snprintf(buf, len, "%ld.%ld %s%s", whole, frac1, magnitude, suffix);
}
_mi_fprintf(out, (fmt==NULL ? "%11s" : fmt), buf);
}
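A worked example of the integer-only formatting above (standalone, with an assumed value of 1.5 MiB):

#include <stdio.h>
#include <stdint.h>
int main(void) {
  int64_t n = 1572864;                          // 1.5 MiB in bytes (unit already applied)
  const int64_t base = 1024;
  int64_t divider = base; const char* magnitude = "k";
  if (n >= divider*base) { divider *= base; magnitude = "m"; }
  if (n >= divider*base) { divider *= base; magnitude = "g"; }
  int64_t tens = n / (divider/10);              // 1572864 / 104857 = 15
  printf("%ld.%ld %sb\n", (long)(tens/10), (long)(tens%10), magnitude);  // prints "1.5 mb"
  return 0;
}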
@@ -199,8 +203,10 @@ static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg
}
static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out) {
double avg = (stat->count == 0 ? 0.0 : (double)stat->total / (double)stat->count);
_mi_fprintf(out, "%10s: %7.1f avg\n", msg, avg);
const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count));
const long avg_whole = (long)(avg_tens/10);
const long avg_frac1 = (long)(avg_tens%10);
_mi_fprintf(out, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1);
}