Merge branch 'dev' into dev-exp

daan 2019-08-11 09:00:39 -07:00
commit d71e67b689
12 changed files with 157 additions and 123 deletions

View file

@@ -314,16 +314,37 @@ static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size)
return &((mi_heap_t*)heap)->pages[_mi_bin(size)];
}
//-----------------------------------------------------------
// Page flags
//-----------------------------------------------------------
static inline uintptr_t mi_page_thread_id(const mi_page_t* page) {
return (page->flags.xthread_id << MI_PAGE_FLAGS_BITS);
return (page->flags & ~MI_PAGE_FLAGS_MASK);
}
static inline void mi_page_init_flags(mi_page_t* page, uintptr_t thread_id) {
page->flags.value = 0;
page->flags.xthread_id = (thread_id >> MI_PAGE_FLAGS_BITS);
mi_assert(page->flags.value == thread_id);
page->flags = thread_id;
}
static inline bool mi_page_is_in_full(const mi_page_t* page) {
return ((page->flags & 0x01) != 0);
}
static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
if (in_full) page->flags |= 0x01;
else page->flags &= ~0x01;
}
static inline bool mi_page_has_aligned(const mi_page_t* page) {
return ((page->flags & 0x02) != 0);
}
static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
if (has_aligned) page->flags |= 0x02;
else page->flags &= ~0x02;
}
// -------------------------------------------------------------------
// Encoding/Decoding the free list next pointers
// -------------------------------------------------------------------

View file

@@ -94,16 +94,16 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_MEDIUM_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128kb on 64-bit
#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/4) // 1Mb on 64-bit
#define MI_LARGE_WSIZE_MAX (MI_LARGE_SIZE_MAX>>MI_INTPTR_SHIFT)
#define MI_HUGE_SIZE_MAX (2*MI_INTPTR_SIZE*MI_SEGMENT_SIZE) // (must match MI_REGION_MAX_ALLOC_SIZE in memory.c)
// Minimal alignment necessary. On most platforms 16 bytes are needed
// due to SSE registers for example. This must be at least `MI_INTPTR_SIZE`
#define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t)
// Maximum number of size classes. (spaced exponentially in 12.5% increments)
#define MI_BIN_HUGE (70U)
#define MI_BIN_HUGE (73U)
#if (MI_LARGE_WSIZE_MAX > 393216)
#if (MI_LARGE_WSIZE_MAX >= 655360)
#error "define more bins"
#endif
@@ -123,25 +123,12 @@ typedef enum mi_delayed_e {
} mi_delayed_t;
// Use the lowest two bits of a thread id for the `in_full` and `has_aligned` flags
// Use the bottom 2 bits for the `in_full` and `has_aligned` flags
// and the rest for the thread id (we assume tids never use those lower 2 bits).
// This allows a single test in `mi_free` to check for unlikely cases
// (namely, non-local free, aligned free, or freeing in a full page)
#define MI_PAGE_FLAGS_BITS (2)
#define MI_PAGE_FLAGS_TID_BITS (MI_INTPTR_SIZE*8 - MI_PAGE_FLAGS_BITS)
typedef union mi_page_flags_u {
uintptr_t value;
struct {
#ifdef MI_BIG_ENDIAN
uintptr_t xthread_id : MI_PAGE_FLAGS_TID_BITS;
#endif
uintptr_t in_full : 1;
uintptr_t has_aligned : 1;
#ifndef MI_BIG_ENDIAN
uintptr_t xthread_id : MI_PAGE_FLAGS_TID_BITS;
#endif
};
} mi_page_flags_t;
#define MI_PAGE_FLAGS_MASK ((uintptr_t)0x03)
typedef uintptr_t mi_page_flags_t;
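
To see why this encoding enables a single comparison, here is a small standalone illustration (a sketch, not part of the commit; the thread id value is made up). Since thread ids never use their bottom 2 bits, initializing the flags to the thread id makes `flags == tid` true exactly when the free is local and neither flag bit is set:

    #include <stdint.h>
    #include <stdio.h>

    #define MI_PAGE_FLAGS_MASK ((uintptr_t)0x03)

    int main(void) {
      uintptr_t tid = (uintptr_t)0x12340000; // hypothetical thread id; bottom 2 bits are zero
      uintptr_t flags = tid;                 // as in mi_page_init_flags(page, tid)
      printf("fast path? %d\n", (int)(flags == tid));  // 1: local, not full, no aligned blocks
      flags |= 0x01;                         // as in mi_page_set_in_full(page, true)
      printf("fast path? %d\n", (int)(flags == tid));  // 0: mi_free takes the generic path
      // the thread id is still recoverable, as in mi_page_thread_id:
      printf("same thread? %d\n", (int)((flags & ~MI_PAGE_FLAGS_MASK) == tid));  // 1
      return 0;
    }
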
// Thread free list.
// We use the bottom 2 bits of the pointer for mi_delayed_t flags
@@ -339,10 +326,13 @@ typedef struct mi_stats_s {
mi_stat_count_t commit_calls;
mi_stat_count_t threads;
mi_stat_count_t huge;
mi_stat_count_t giant;
mi_stat_count_t malloc;
mi_stat_count_t segments_cache;
mi_stat_counter_t page_no_retire;
mi_stat_counter_t searches;
mi_stat_counter_t huge_count;
mi_stat_counter_t giant_count;
#if MI_STAT>1
mi_stat_count_t normal[MI_BIN_HUGE+1];
#endif
@@ -393,12 +383,8 @@ typedef struct mi_segments_tld_s {
} mi_segments_tld_t;
// OS thread local data
typedef struct mi_os_tld_s {
uintptr_t mmap_next_probable; // probable next address start allocated by mmap (to guess which path to take on alignment)
void* mmap_previous; // previous address returned by mmap
uint8_t* pool; // pool of segments to reduce mmap calls on some platforms
size_t pool_available; // bytes available in the pool
mi_stats_t* stats; // points to tld stats
typedef struct mi_os_tld_s {
mi_stats_t* stats; // points to tld stats
} mi_os_tld_t;
// Thread local data

View file

@@ -43,7 +43,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* heap, size_t size, size_t
if (p == NULL) return NULL;
// .. and align within the allocation
_mi_ptr_page(p)->flags.has_aligned = true;
mi_page_set_has_aligned( _mi_ptr_page(p), true );
uintptr_t adjust = alignment - (((uintptr_t)p + offset) % alignment);
mi_assert_internal(adjust % sizeof(uintptr_t) == 0);
void* aligned_p = (adjust == alignment ? p : (void*)((uintptr_t)p + adjust));

View file

@@ -174,7 +174,7 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block
if (mi_unlikely(mi_page_all_free(page))) {
_mi_page_retire(page);
}
else if (mi_unlikely(page->flags.in_full)) {
else if (mi_unlikely(mi_page_is_in_full(page))) {
_mi_page_unfull(page);
}
}
@@ -194,7 +194,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p
static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool local, void* p) {
mi_block_t* block = (page->flags.has_aligned ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
mi_block_t* block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
_mi_free_block(page, local, block);
}
@@ -237,7 +237,7 @@ void mi_free(void* p) mi_attr_noexcept
#endif
uintptr_t tid = _mi_thread_id();
if (mi_likely(tid == page->flags.value)) {
if (mi_likely(tid == page->flags)) { // if equal, the thread id matches and it is not a full page, nor has aligned blocks
// local, and not full or aligned
mi_block_t* block = (mi_block_t*)p;
mi_block_set_next(page, block, page->local_free);
@@ -273,7 +273,7 @@ size_t mi_usable_size(const void* p) mi_attr_noexcept {
const mi_segment_t* segment = _mi_ptr_segment(p);
const mi_page_t* page = _mi_segment_page_of(segment,p);
size_t size = page->block_size;
if (mi_unlikely(page->flags.has_aligned)) {
if (mi_unlikely(mi_page_has_aligned(page))) {
ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p);
mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
return (size - adjust);

View file

@@ -246,7 +246,12 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
// stats
if (page->block_size > MI_LARGE_SIZE_MAX) {
mi_heap_stat_decrease(heap,huge,page->block_size);
if (page->block_size > MI_HUGE_SIZE_MAX) {
_mi_stat_decrease(&heap->tld->stats.giant,page->block_size);
}
else {
_mi_stat_decrease(&heap->tld->stats.huge, page->block_size);
}
}
#if (MI_STAT>1)
size_t inuse = page->used - page->thread_freed;

View file

@@ -42,8 +42,8 @@ const mi_page_t _mi_page_empty = {
QNULL( 2560), QNULL( 3072), QNULL( 3584), QNULL( 4096), QNULL( 5120), QNULL( 6144), QNULL( 7168), QNULL( 8192), /* 48 */ \
QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), /* 69 */ \
QNULL(MI_LARGE_WSIZE_MAX + 1 /* 393216, Huge queue */), \
QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \
QNULL(MI_LARGE_WSIZE_MAX + 1 /* 655360, Huge queue */), \
QNULL(MI_LARGE_WSIZE_MAX + 2) /* Full queue */ }
#define MI_STAT_COUNT_NULL() {0,0,0,0}
@@ -63,9 +63,8 @@ const mi_page_t _mi_page_empty = {
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), \
{ 0, 0 }, \
{ 0, 0 } \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
MI_STAT_COUNT_END_NULL()
// --------------------------------------------------------
@@ -98,8 +97,8 @@ static mi_tld_t tld_main = {
0,
&_mi_heap_main,
{ { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments
{ 0, NULL, NULL, 0, tld_main_stats }, // os
{ MI_STATS_NULL } // stats
{ tld_main_stats }, // os
{ MI_STATS_NULL } // stats
};
mi_heap_t _mi_heap_main = {

View file

@@ -126,7 +126,8 @@ Commit from a region
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld) {
static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld)
{
size_t mask = mi_region_block_mask(blocks,bitidx);
mi_assert_internal(mask != 0);
mi_assert_internal((mask & mi_atomic_read(&region->map)) == mask);
@@ -139,7 +140,13 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
start = ALLOCATING; // try to start allocating
}
else if (start == ALLOCATING) {
mi_atomic_yield(); // another thread is already allocating.. wait it out
// another thread is already allocating.. wait it out
// note: the wait here is not great (but should not happen often). Another
// strategy might be to just allocate another region in parallel. This tends
// to be bad for benchmarks though as these often start many threads at the
// same time leading to the allocation of too many regions. (Still, this might
// be the most performant and it's ok on 64-bit virtual memory with over-commit.)
mi_atomic_yield();
continue;
}
} while( start == ALLOCATING && !mi_atomic_compare_exchange_ptr(&region->start, ALLOCATING, NULL) );
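
The claim protocol around `region->start` is easier to see in a standalone model. Below is a minimal sketch using C11 atomics (mimalloc uses its own atomic wrappers; `malloc` stands in for the actual OS allocation): the slot moves NULL -> ALLOCATING -> published pointer, and racing threads yield until the winner publishes:

    #include <stdatomic.h>
    #include <stdlib.h>
    #include <sched.h>

    #define ALLOCATING ((void*)1)
    static _Atomic(void*) region_start;

    static void* claim_or_wait(size_t size) {
      for (;;) {
        void* start = atomic_load(&region_start);
        if (start == ALLOCATING) { sched_yield(); continue; }  // wait out the winner
        if (start != NULL) return start;                       // already published
        void* expected = NULL;
        if (atomic_compare_exchange_strong(&region_start, &expected, ALLOCATING)) {
          void* mem = malloc(size);            // stand-in for the OS allocation
          atomic_store(&region_start, mem);    // publish; waiting racers will see it
          return mem;
        }
      }
    }
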
@@ -183,47 +190,35 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld) {
static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld)
{
mi_assert_internal(p != NULL && id != NULL);
mi_assert_internal(blocks < MI_REGION_MAP_BITS);
const uintptr_t mask = mi_region_block_mask(blocks,0);
const size_t bitidx_max = MI_REGION_MAP_BITS - blocks;
size_t bitidx ;
uintptr_t map;
uintptr_t newmap;
do { // while no atomic claim success and not all bits seen
// find the first free range of bits
map = mi_atomic_read(&region->map);
size_t m = map;
bitidx = 0;
do {
// skip ones
while ((m&1) != 0) { bitidx++; m>>=1; }
// count zeros
mi_assert_internal((m&1)==0);
size_t zeros = 1;
m >>= 1;
while(zeros < blocks && (m&1)==0) { zeros++; m>>=1; }
if (zeros == blocks) break; // found a range that fits
bitidx += zeros;
}
while(bitidx <= bitidx_max);
if (bitidx > bitidx_max) {
return true; // no error, but could not find a range either
}
// try to claim it
mi_assert_internal( (mask << bitidx) >> bitidx == mask ); // no overflow?
mi_assert_internal( (map & (mask << bitidx)) == 0); // fits in zero range
newmap = map | (mask << bitidx);
mi_assert_internal((newmap^map) >> bitidx == mask);
// scan linearly for a free range of zero bits
uintptr_t map = mi_atomic_read(&region->map);
uintptr_t m = mask; // the mask shifted by bitidx
for(size_t bitidx = 0; bitidx <= bitidx_max; bitidx++, m <<= 1) {
if ((map & m) == 0) { // are the mask bits free at bitidx?
mi_assert_internal((m >> bitidx) == mask); // no overflow?
uintptr_t newmap = map | m;
mi_assert_internal((newmap^map) >> bitidx == mask);
if (!mi_atomic_compare_exchange(&region->map, newmap, map)) {
// no success, another thread claimed concurrently.. keep going
map = mi_atomic_read(&region->map);
}
else {
// success, we claimed the bits
// now commit the block memory -- this can still fail
return mi_region_commit_blocks(region, idx, bitidx, blocks, size, commit, p, id, tld);
}
}
}
while(!mi_atomic_compare_exchange(&region->map, newmap, map));
// success, we claimed the blocks atomically
// now commit the block memory -- this can still fail
return mi_region_commit_blocks(region, idx, bitidx, blocks, size, commit, p, id, tld);
// no error, but also no bits found
return true;
}
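
The rewritten scan is easier to follow in isolation. A single-threaded sketch (not part of the commit; the compare-and-exchange is modeled as a plain store, and `blocks < map_bits` is assumed, as the real code asserts) of finding and claiming `blocks` consecutive zero bits:

    #include <stdint.h>
    #include <stdio.h>

    static int claim_blocks(uintptr_t* map, size_t blocks, size_t* bitidx_out) {
      const size_t map_bits = sizeof(uintptr_t) * 8;
      const uintptr_t mask = (((uintptr_t)1 << blocks) - 1);
      uintptr_t m = mask;                          // the mask shifted by bitidx
      for (size_t bitidx = 0; bitidx <= map_bits - blocks; bitidx++, m <<= 1) {
        if ((*map & m) == 0) {                     // all `blocks` bits free at bitidx?
          *map |= m;                               // claim them (a CAS in the real code)
          *bitidx_out = bitidx;
          return 1;
        }
      }
      return 0;                                    // no range found (not an error)
    }

    int main(void) {
      uintptr_t map = 0x8F;                        // bits 0-3 and 7 already taken
      size_t bitidx;
      if (claim_blocks(&map, 3, &bitidx))
        printf("claimed 3 blocks at bit %zu, map=0x%zx\n", bitidx, (size_t)map);  // bit 4, 0xff
      return 0;
    }
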
// Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim.
@@ -274,13 +269,14 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t*
size_t count = mi_atomic_read(&regions_count);
size_t idx = mi_atomic_read(&region_next_idx);
for (size_t visited = 0; visited < count; visited++, idx++) {
if (!mi_region_try_alloc_blocks(idx%count, blocks, size, commit, &p, id, tld)) return NULL; // error
if (idx >= count) idx = 0; // wrap around
if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, &p, id, tld)) return NULL; // error
if (p != NULL) break;
}
if (p == NULL) {
// no free range in existing regions -- try to extend beyond the count
for (idx = count; idx < MI_REGION_MAX; idx++) {
// no free range in existing regions -- try to extend beyond the count.. but at most 4 regions
for (idx = count; idx < count + 4 && idx < MI_REGION_MAX; idx++) {
if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, &p, id, tld)) return NULL; // error
if (p != NULL) break;
}

View file

@@ -225,27 +225,29 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment,
#elif defined(__wasi__)
static void* mi_wasm_heap_grow(size_t size, size_t try_alignment) {
uintptr_t base = __builtin_wasm_memory_size(0) * os_page_size;
uintptr_t base = __builtin_wasm_memory_size(0) * _mi_os_page_size();
uintptr_t aligned_base = _mi_align_up(base, (uintptr_t) try_alignment);
size_t alloc_size = aligned_base - base + size;
mi_assert(alloc_size >= size);
size_t alloc_size = _mi_align_up( aligned_base - base + size, _mi_os_page_size());
mi_assert(alloc_size >= size && (alloc_size % _mi_os_page_size()) == 0);
if (alloc_size < size) return NULL;
if (__builtin_wasm_memory_grow(0, alloc_size / os_page_size) == SIZE_MAX) {
if (__builtin_wasm_memory_grow(0, alloc_size / _mi_os_page_size()) == SIZE_MAX) {
errno = ENOMEM;
return NULL;
}
return (void*) aligned_base;
return (void*)aligned_base;
}
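
A quick numeric check of the grow arithmetic above, as a standalone sketch with assumed values (64KiB wasm pages, a 4MiB alignment request, and a made-up current memory end):

    #include <stdint.h>
    #include <stdio.h>

    static uintptr_t align_up(uintptr_t x, uintptr_t a) { return (x + a - 1) & ~(a - 1); }

    int main(void) {
      const uintptr_t page_size = 65536;                  // _mi_os_page_size() on wasm
      const uintptr_t base = 7 * page_size;               // assumed current memory end
      uintptr_t aligned_base = align_up(base, (uintptr_t)1 << 22);                  // 4MiB alignment
      uintptr_t alloc_size = align_up(aligned_base - base + ((uintptr_t)1 << 22), page_size);
      // grow by alloc_size so [aligned_base, aligned_base + 4MiB) lies inside the
      // grown area; alloc_size is page-rounded so memory_grow gets a whole page count
      printf("base=%#zx aligned_base=%#zx grow=%zu pages\n",
             (size_t)base, (size_t)aligned_base, (size_t)(alloc_size / page_size));
      return 0;
    }
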
#else
static void* mi_unix_mmapx(size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
void* p = NULL;
#if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
// on 64-bit systems, use a special area for 4MiB aligned allocations
// on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations
static volatile intptr_t aligned_base = ((intptr_t)1 << 42); // starting at 4TiB
if (try_alignment <= MI_SEGMENT_SIZE && (size%MI_SEGMENT_SIZE)==0 && (aligned_base%try_alignment)==0) {
if (try_alignment <= MI_SEGMENT_SIZE && (size%MI_SEGMENT_SIZE)==0) {
intptr_t hint = mi_atomic_add(&aligned_base,size) - size;
p = mmap((void*)hint,size,protect_flags,flags,fd,0);
if (p==MAP_FAILED) p = NULL; // fall back to regular mmap
if (hint%try_alignment == 0) {
p = mmap((void*)hint,size,protect_flags,flags,fd,0);
if (p==MAP_FAILED) p = NULL; // fall back to regular mmap
}
}
#endif
if (p==NULL) {
@@ -273,10 +275,10 @@ static void* mi_unix_mmap(size_t size, size_t try_alignment, int protect_flags)
protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
#endif
#if defined(VM_MAKE_TAG)
// darwin: tracking anonymous page with a specific ID all up to 98 are taken officially but LLVM sanitizers had taken 99
// macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
fd = VM_MAKE_TAG(100);
#endif
if (large_os_page_size > 0 && use_large_os_page(size, try_alignment)) {
if (use_large_os_page(size, try_alignment)) {
int lflags = flags;
int lfd = fd;
#ifdef MAP_ALIGNED_SUPER
@@ -308,7 +310,7 @@ static void* mi_unix_mmap(size_t size, size_t try_alignment, int protect_flags)
#endif
// Primitive allocation from the OS.
// Note: the `alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, mi_stats_t* stats) {
mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
if (size == 0) return NULL;

View file

@@ -177,7 +177,7 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t*
#endif
static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
uint8_t bin = (page->flags.in_full ? MI_BIN_FULL : _mi_bin(page->block_size));
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->block_size));
mi_heap_t* heap = page->heap;
mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL);
mi_page_queue_t* pq = &heap->pages[bin];
@@ -187,10 +187,10 @@ static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
}
static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
uint8_t bin = (page->flags.in_full ? MI_BIN_FULL : _mi_bin(page->block_size));
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->block_size));
mi_assert_internal(bin <= MI_BIN_FULL);
mi_page_queue_t* pq = &heap->pages[bin];
mi_assert_internal(page->flags.in_full || page->block_size == pq->block_size);
mi_assert_internal(mi_page_is_in_full(page) || page->block_size == pq->block_size);
return pq;
}
@@ -245,7 +245,7 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(mi_page_queue_contains(queue, page));
mi_assert_internal(page->block_size == queue->block_size || (page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_huge(queue)) || (page->flags.in_full && mi_page_queue_is_full(queue)));
mi_assert_internal(page->block_size == queue->block_size || (page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
if (page->prev != NULL) page->prev->next = page->next;
if (page->next != NULL) page->next->prev = page->prev;
if (page == queue->last) queue->last = page->prev;
@@ -260,7 +260,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
page->next = NULL;
page->prev = NULL;
page->heap = NULL;
page->flags.in_full = false;
mi_page_set_in_full(page,false);
}
@@ -269,9 +269,9 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
mi_assert_internal(!mi_page_queue_contains(queue, page));
mi_assert_internal(page->block_size == queue->block_size ||
(page->block_size > MI_LARGE_SIZE_MAX && mi_page_queue_is_huge(queue)) ||
(page->flags.in_full && mi_page_queue_is_full(queue)));
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
page->flags.in_full = mi_page_queue_is_full(queue);
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
page->heap = heap;
page->next = queue->first;
page->prev = NULL;
@@ -324,7 +324,7 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro
mi_heap_queue_first_update(page->heap, to);
}
page->flags.in_full = mi_page_queue_is_full(to);
mi_page_set_in_full(page, mi_page_queue_is_full(to));
}
size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) {

View file

@@ -102,7 +102,7 @@ bool _mi_page_is_valid(mi_page_t* page) {
mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == page->heap->thread_id);
mi_page_queue_t* pq = mi_page_queue_of(page);
mi_assert_internal(mi_page_queue_contains(pq, page));
mi_assert_internal(pq->block_size==page->block_size || page->block_size > MI_LARGE_SIZE_MAX || page->flags.in_full);
mi_assert_internal(pq->block_size==page->block_size || page->block_size > MI_LARGE_SIZE_MAX || mi_page_is_in_full(page));
mi_assert_internal(mi_heap_contains_queue(page->heap,pq));
}
return true;
@@ -282,26 +282,26 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
void _mi_page_unfull(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(page->flags.in_full);
mi_assert_internal(mi_page_is_in_full(page));
_mi_page_use_delayed_free(page, MI_NO_DELAYED_FREE);
if (!page->flags.in_full) return;
if (!mi_page_is_in_full(page)) return;
mi_heap_t* heap = page->heap;
mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
page->flags.in_full = false; // to get the right queue
mi_page_set_in_full(page, false); // to get the right queue
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
page->flags.in_full = true;
mi_page_set_in_full(page, true);
mi_page_queue_enqueue_from(pq, pqfull, page);
}
static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(!mi_page_immediate_available(page));
mi_assert_internal(!page->flags.in_full);
mi_assert_internal(!mi_page_is_in_full(page));
_mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE);
if (page->flags.in_full) return;
if (mi_page_is_in_full(page)) return;
mi_page_queue_enqueue_from(&page->heap->pages[MI_BIN_FULL], pq, page);
mi_page_thread_free_collect(page); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
@@ -349,11 +349,16 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
mi_assert_internal(mi_tf_delayed(free) != MI_DELAYED_FREEING);
#endif
page->flags.has_aligned = false;
mi_page_set_has_aligned(page, false);
// account for huge pages here
if (page->block_size > MI_LARGE_SIZE_MAX) {
_mi_stat_decrease(&page->heap->tld->stats.huge, page->block_size);
if (page->block_size > MI_HUGE_SIZE_MAX) {
_mi_stat_decrease(&page->heap->tld->stats.giant, page->block_size);
}
else {
_mi_stat_decrease(&page->heap->tld->stats.huge, page->block_size);
}
}
// remove from the page list
@@ -377,7 +382,7 @@ void _mi_page_retire(mi_page_t* page) {
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(mi_page_all_free(page));
page->flags.has_aligned = false;
mi_page_set_has_aligned(page, false);
// don't retire too often..
// (or we end up retiring and re-allocating most of the time)
@@ -560,7 +565,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_internal(page->thread_freed == 0);
mi_assert_internal(page->next == NULL);
mi_assert_internal(page->prev == NULL);
mi_assert_internal(page->flags.has_aligned == false);
mi_assert_internal(!mi_page_has_aligned(page));
#if MI_SECURE
mi_assert_internal(page->cookie != 0);
#endif
@@ -619,7 +624,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
// 3. If the page is completely full, move it to the `mi_pages_full`
// queue so we don't visit long-lived pages too often.
mi_assert_internal(!page->flags.in_full && !mi_page_immediate_available(page));
mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
mi_page_to_full(page,pq);
page = next;
@@ -702,7 +707,14 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) {
if (page != NULL) {
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(page->block_size == block_size);
_mi_stat_increase( &heap->tld->stats.huge, block_size);
if (page->block_size > MI_HUGE_SIZE_MAX) {
_mi_stat_increase(&heap->tld->stats.giant, block_size);
_mi_stat_counter_increase(&heap->tld->stats.giant_count, 1);
}
else {
_mi_stat_increase(&heap->tld->stats.huge, block_size);
_mi_stat_counter_increase(&heap->tld->stats.huge_count, 1);
}
}
return page;
}

View file

@@ -106,8 +106,11 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
mi_stat_add(&stats->malloc, &src->malloc, 1);
mi_stat_add(&stats->segments_cache, &src->segments_cache, 1);
mi_stat_add(&stats->huge, &src->huge, 1);
mi_stat_add(&stats->giant, &src->giant, 1);
mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
mi_stat_counter_add(&stats->searches, &src->searches, 1);
mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
mi_stat_counter_add(&stats->giant_count, &src->giant_count, 1);
#if MI_STAT>1
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
if (src->normal[i].allocated > 0 || src->normal[i].freed > 0) {
@@ -152,20 +155,29 @@ static void mi_print_count(int64_t n, int64_t unit, FILE* out) {
}
static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, FILE* out ) {
_mi_fprintf(out,"%10s:", msg);
mi_print_amount(stat->peak, unit, out);
if (unit!=0) {
_mi_fprintf(out,"%10s:", msg);
if (unit>0) {
mi_print_amount(stat->peak, unit, out);
mi_print_amount(stat->allocated, unit, out);
mi_print_amount(stat->freed, unit, out);
}
if (unit>0) {
mi_print_amount(unit, (unit==0 ? 0 : 1), out);
mi_print_amount(unit, 1, out);
mi_print_count(stat->allocated, unit, out);
if (stat->allocated > stat->freed)
_mi_fprintf(out, " not all freed!\n");
else
_mi_fprintf(out, " ok\n");
}
else if (unit<0) {
mi_print_amount(stat->peak, 1, out);
mi_print_amount(stat->allocated, 1, out);
mi_print_amount(stat->freed, 1, out);
mi_print_amount(-unit, 1, out);
mi_print_count((stat->allocated / -unit), 0, out);
if (stat->allocated > stat->freed)
_mi_fprintf(out, " not all freed!\n");
else
_mi_fprintf(out, " ok\n");
}
else {
_mi_fprintf(out, "\n");
}
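
The sign of `unit` now selects between two layouts: a positive `unit` is a fixed block size (the count column is `allocated/unit`), while a negative `unit` carries the average block size for variably-sized categories, which is what the `huge` and `giant` call sites below compute with `-(allocated / count)`. A standalone sketch of the convention (made-up numbers, not the actual output format):

    #include <stdint.h>
    #include <stdio.h>

    static void print_stat(const char* msg, int64_t allocated, int64_t unit) {
      if (unit > 0)       // fixed block size: exact count
        printf("%10s: %lld blocks of %lld bytes\n", msg,
               (long long)(allocated / unit), (long long)unit);
      else if (unit < 0)  // -unit is the average block size
        printf("%10s: %lld blocks of ~%lld bytes each\n", msg,
               (long long)(allocated / -unit), (long long)-unit);
      else
        printf("%10s\n", msg);
    }

    int main(void) {
      int64_t huge_allocated = 3*1048576 + 2*4194304;  // five huge blocks, mixed sizes
      int64_t huge_count = 5;
      print_stat("huge", huge_allocated, -(huge_allocated / huge_count));
      return 0;
    }
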
@@ -219,10 +231,12 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, FILE* out) mi_attr_n
mi_stat_count_t normal = { 0,0,0,0 };
mi_stats_print_bins(&normal, stats->normal, MI_BIN_HUGE, "normal",out);
mi_stat_print(&normal, "normal", 1, out);
mi_stat_print(&stats->huge, "huge", 1, out);
mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out);
mi_stat_print(&stats->giant, "giant", (stats->giant_count.count == 0 ? 1 : -(stats->giant.allocated / stats->giant_count.count)), out);
mi_stat_count_t total = { 0,0,0,0 };
mi_stat_add(&total, &normal, 1);
mi_stat_add(&total, &stats->huge, 1);
mi_stat_add(&total, &stats->giant, 1);
mi_stat_print(&total, "total", 1, out);
_mi_fprintf(out, "malloc requested: ");
mi_print_amount(stats->malloc.allocated, 1, out);

View file

@@ -6,7 +6,6 @@
#include <mimalloc.h>
#include <mimalloc-override.h> // redefines malloc etc.
int main() {
mi_version();
void* p1 = malloc(78);