fix page info size and order; atomic page flags

daanx 2024-12-06 22:37:59 -08:00
parent 5a5943ad33
commit 659a9dd51d
10 changed files with 87 additions and 89 deletions

View file

@@ -360,7 +360,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM
list(APPEND mi_cflags_dynamic -ftls-model=initial-exec)
message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)")
else()
list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2)
list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2 -O2)
endif()
endif()
if(MI_OVERRIDE)

View file

@@ -80,10 +80,12 @@ terms of the MIT license. A copy of the license can be found in the file
#define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_and_relaxed(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_or_relaxed(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1)
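
The and/or variants make it possible to set and clear individual flag bits atomically, which the new atomic page flags below rely on. As a minimal standalone sketch in plain C11 (illustrative only, not part of this diff), the pattern these macros wrap looks like this:

// Sketch: set and clear flag bits with C11 atomics, the pattern behind
// mi_atomic_or_acq_rel / mi_atomic_and_acq_rel. Flag values are hypothetical.
#include <stdatomic.h>
#include <stdio.h>

#define IN_FULL_QUEUE ((size_t)0x01)
#define HAS_ALIGNED   ((size_t)0x02)

int main(void) {
  _Atomic(size_t) xflags = 0;
  atomic_fetch_or_explicit(&xflags, IN_FULL_QUEUE, memory_order_acq_rel);   // set a bit
  atomic_fetch_and_explicit(&xflags, ~HAS_ALIGNED, memory_order_acq_rel);   // clear a bit
  printf("flags: 0x%zx\n", atomic_load_explicit(&xflags, memory_order_acquire));
  return 0;
}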

View file

@@ -667,7 +667,8 @@ static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) {
static inline bool mi_page_is_huge(const mi_page_t* page) {
return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN));
return (page->block_size > MI_LARGE_MAX_OBJ_SIZE ||
(mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page));
}
@@ -727,20 +728,33 @@ static inline bool _mi_page_unown(mi_page_t* page) {
//-----------------------------------------------------------
// Page flags
//-----------------------------------------------------------
static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) {
return mi_atomic_load_acquire(&page->xflags);
}
static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) {
if (set) {
mi_atomic_or_acq_rel(&page->xflags, newflag);
}
else {
mi_atomic_and_acq_rel(&page->xflags, ~newflag);
}
}
static inline bool mi_page_is_in_full(const mi_page_t* page) {
return page->flags.x.in_full;
return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0);
}
static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
page->flags.x.in_full = in_full;
mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE);
}
static inline bool mi_page_has_aligned(const mi_page_t* page) {
return page->flags.x.has_aligned;
return ((mi_page_flags(page) & MI_PAGE_HAS_ALIGNED) != 0);
}
static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
page->flags.x.has_aligned = has_aligned;
mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED);
}
/* -------------------------------------------------------------------

View file

@@ -167,8 +167,8 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t alignment; // alignment at allocation
size_t size; // allocated full size
// size_t alignment; // alignment at allocation
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
@@ -224,26 +224,11 @@ typedef enum mi_owned_e {
} mi_owned_t;
// The `in_full` and `has_aligned` page flags are put in a union to efficiently
// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
#if !MI_TSAN
typedef union mi_page_flags_s {
uint8_t full_aligned;
struct {
uint8_t in_full : 1;
uint8_t has_aligned : 1;
} x;
} mi_page_flags_t;
#else
// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
typedef union mi_page_flags_s {
uint32_t full_aligned;
struct {
uint8_t in_full;
uint8_t has_aligned;
} x;
} mi_page_flags_t;
#endif
// The `in_full` and `has_aligned` page flags are put in the same field
// to efficiently test if both are false (`full_aligned == 0`) in the `mi_free` routine.
#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01)
#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02)
typedef size_t mi_page_flags_t;
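
Keeping both flags in one word preserves the old fast-path property: mi_free can test "not in the full queue and no aligned blocks" with a single load and a compare against zero (see the free.c hunk further down). A minimal sketch of that combined test, with hypothetical names standing in for the real ones:

// Sketch (hypothetical names): both flags share one word, so the common free
// path needs only one comparison against zero.
#include <stdbool.h>
#include <stddef.h>

typedef size_t page_flags_t;
#define PAGE_IN_FULL_QUEUE ((size_t)0x01)
#define PAGE_HAS_ALIGNED   ((size_t)0x02)

static inline bool page_fast_free_path(page_flags_t flags) {
  // true only when the page is neither in the full queue nor has aligned blocks
  return ((flags & (PAGE_IN_FULL_QUEUE | PAGE_HAS_ALIGNED)) == 0);   // i.e. flags == 0
}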
// Thread free list.
// We use the bottom bit of the pointer for `mi_owned_t` flags
@@ -287,23 +272,21 @@ typedef struct mi_page_s {
uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation)
uint16_t reserved; // number of blocks reserved in memory
uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type
mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits)
uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized
uint8_t retire_expire:7; // expiration count for retired blocks
// padding
uint8_t retire_expire; // expiration count for retired blocks
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
_Atomic(mi_page_flags_t) xflags; // `in_full` and `has_aligned` flags
size_t block_size; // size available in each block (always `>0`)
uint8_t* page_start; // start of the blocks
uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type
bool free_is_zero; // `true` if the blocks in the free list are zero initialized
// padding
#if (MI_ENCODE_FREELIST || MI_PADDING)
uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
#endif
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
mi_heap_t* heap; // heap this page belongs to
struct mi_page_s* next; // next page owned by the heap with the same `block_size`
struct mi_page_s* prev; // previous page owned by the heap with the same `block_size`
@@ -317,10 +300,10 @@ typedef struct mi_page_s {
// ------------------------------------------------------
#define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map.
#define MI_PAGE_MIN_BLOCK_ALIGN (32) // minimal block alignment in a page
#define MI_PAGE_MIN_BLOCK_ALIGN (64) // minimal block alignment in a page
#define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation
#if MI_DEBUG && MI_SIZE_SIZE == 8
#if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8
#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t)
#else
#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t)
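
For reference, assuming a 64-bit build where MI_INTPTR_SHIFT is 3 (per mimalloc's usual definitions) and the new MI_PAGE_MIN_BLOCK_ALIGN of 64, the constant works out to:

with MI_ENCODE_FREELIST or MI_PADDING:  MI_PAGE_INFO_SIZE = (3+2)*64 = 320 bytes
otherwise:                              MI_PAGE_INFO_SIZE = (3+1)*64 = 256 bytes

Either value must still cover sizeof(mi_page_t) rounded up to MI_PAGE_MIN_BLOCK_ALIGN, which the runtime check added in arena.c below now verifies instead of a debug-only assert.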

View file

@@ -596,7 +596,9 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz
}
}
#endif
mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN));
if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) {
_mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small\n");
};
const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE);
const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size);
mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX);
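
A worked example with illustrative numbers (one 64 KiB slice, 64-byte blocks, MI_PAGE_INFO_SIZE of 256, and no over-aligned OS allocation, so os_align is false):

block_start = MI_PAGE_INFO_SIZE       = 256
reserved    = (65536 - 256) / 64      = 1020 blocks

which satisfies the assertion that reserved is positive and fits in the 16-bit reserved field.
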
@@ -1126,8 +1128,8 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) {
return bit_set_count;
}
static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) {
_mi_output_message("%s%s:\n", prefix, header);
static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) {
_mi_output_message("%s:\n", header);
size_t bit_count = 0;
size_t bit_set_count = 0;
for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) {
@@ -1135,19 +1137,13 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_
size_t k = 0;
mi_bchunk_t* chunk = &bitmap->chunks[i];
if (i<10) { buf[k++] = ' '; }
if (i<100) { itoa((int)i, buf+k, 10); k += (i < 10 ? 1 : 2); }
buf[k++] = ' ';
if (i<10) { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; }
else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; }
else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); }
for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
if (j > 0 && (j % 4) == 0) {
buf[k++] = '\n';
_mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix);
buf[k++] = ' ';
buf[k++] = ' ';
buf[k++] = ' ';
buf[k++] = ' ';
buf[k++] = ' ';
buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5;
}
if (bit_count < slice_count) {
mi_bfield_t bfield = chunk->bfields[j];
@@ -1164,9 +1160,9 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_
}
bit_count += MI_BFIELD_BITS;
}
_mi_output_message("%s %s\n", prefix, buf);
_mi_output_message(" %s\n", buf);
}
_mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count);
_mi_output_message(" total ('x'): %zu\n", bit_set_count);
return bit_set_count;
}
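
The removed itoa call is replaced by manual digit formatting of the chunk index into a fixed 3-character, left-aligned field. A standalone sketch of the same formatting (not mimalloc code):

#include <stdio.h>

// Write index i (assumed < 1000) left-aligned into a 3-character field.
static int format_index(char* buf, size_t i) {
  int k = 0;
  if (i < 10)       { buf[k++] = (char)('0' + i); buf[k++] = ' '; buf[k++] = ' '; }
  else if (i < 100) { buf[k++] = (char)('0' + i/10); buf[k++] = (char)('0' + i%10); buf[k++] = ' '; }
  else              { buf[k++] = (char)('0' + i/100); buf[k++] = (char)('0' + (i%100)/10); buf[k++] = (char)('0' + i%10); }
  return k;  // always 3 characters written
}

int main(void) {
  char buf[4] = { 0 };
  format_index(buf, 42);
  printf("[%s]\n", buf);   // prints "[42 ]"
  return 0;
}
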
@@ -1183,12 +1179,12 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge)
slice_total += arena->slice_count;
_mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : ""));
if (show_inuse) {
free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, arena->slices_free, true);
free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true);
}
mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, arena->slices_committed, false);
mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false);
// todo: abandoned slices
if (show_purge) {
purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, arena->slices_purge, false);
purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false);
}
}
if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total);

View file

@@ -805,10 +805,10 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx)
return false;
}
// record the max clear
size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
/*size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
do {
if mi_likely(chunk_idx <= oldmax) break;
} while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));
} while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));*/
return true;
}
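
The commented-out loop is the standard lock-free pattern for advancing a monotonic maximum; disabling it means chunk_max_clear is no longer maintained (and the corresponding load in the hunk below is disabled as well). For reference, a standalone sketch of that pattern in plain C11 (not mimalloc code):

#include <stdatomic.h>
#include <stddef.h>

// Advance *max to value, but never move it backwards, using a weak CAS loop.
static void update_max(_Atomic(size_t)* max, size_t value) {
  size_t oldmax = atomic_load_explicit(max, memory_order_relaxed);
  do {
    if (value <= oldmax) break;   // already at least as large; nothing to do
  } while (!atomic_compare_exchange_weak_explicit(max, &oldmax, value,
                                                  memory_order_acq_rel,
                                                  memory_order_acquire));
}
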
@@ -1046,7 +1046,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
{ \
/* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \
MI_UNUSED(tseq); \
const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); /* mi_bitmap_chunk_count(bitmap) */ \
/* const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); */ /* mi_bitmap_chunk_count(bitmap) */ \
const size_t chunk_start = 0; /* (chunk_max <= 1 ? 0 : (tseq % chunk_max)); */ /* space out threads */ \
const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \
const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \

View file

@@ -163,8 +163,9 @@ void mi_free(void* p) mi_attr_noexcept
if mi_unlikely(page==NULL) return;
const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page));
const mi_page_flags_t flags = mi_page_flags(page);
if mi_likely(is_local) { // thread-local free?
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
if mi_likely(flags == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
// thread-local, aligned, and not a full page
mi_block_t* const block = (mi_block_t*)p;
mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
@@ -176,7 +177,7 @@ void mi_free(void* p) mi_attr_noexcept
}
else {
// free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap)
if mi_likely(page->flags.full_aligned == 0) {
if mi_likely(flags == 0) {
// blocks are aligned (and not a full page)
mi_block_t* const block = (mi_block_t*)p;
mi_free_block_mt(page,block);

View file

@@ -20,21 +20,21 @@ const mi_page_t _mi_page_empty = {
0, // capacity
0, // reserved capacity
0, // block size shift
0, // heap tag
{ 0 }, // flags
false, // is_zero
0, // retire_expire
NULL, // local_free
MI_ATOMIC_VAR_INIT(0), // xthread_free
MI_ATOMIC_VAR_INIT(0), // xflags
0, // block_size
NULL, // page_start
0, // heap tag
false, // is_zero
#if (MI_PADDING || MI_ENCODE_FREELIST)
{ 0, 0 },
#endif
MI_ATOMIC_VAR_INIT(0), // xthread_free
NULL, // xheap
NULL, NULL, // next, prev
NULL, // subproc
{ {{ NULL, 0, 0}}, false, false, false, MI_MEM_NONE } // memid
{ {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid
};
#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)

View file

@@ -128,7 +128,7 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me
// different base? (due to alignment)
if (memid.mem.os.base != base) {
mi_assert(memid.mem.os.base <= addr);
mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr);
// mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr);
base = memid.mem.os.base;
if (memid.mem.os.size==0) { csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); }
}
@@ -305,7 +305,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
if (p != NULL) {
*memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large);
memid->mem.os.base = os_base;
memid->mem.os.alignment = alignment;
// memid->mem.os.alignment = alignment;
memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned
}
return p;
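
With the alignment field gone, the base pointer and an adjusted size are enough to free an over-aligned OS allocation. A worked example with illustrative numbers, assuming _mi_memid_create_os records the requested size:

os_base = 0x10000, aligned pointer p = 0x14000, requested size = S
memid.mem.os.base = 0x10000
memid.mem.os.size = S + (p - os_base) = S + 0x4000   // covers [base, p + S)

On free, _mi_os_free_ex can then release the whole range starting at base; the alignment-based assertion in the hunk above is commented out because that field no longer exists.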

View file

@@ -40,7 +40,7 @@ static int ITER = 20;
static int THREADS = 8;
static int SCALE = 10;
static int ITER = 10;
#elif 0
#elif 1
static int THREADS = 4;
static int SCALE = 100;
static int ITER = 10;
@@ -347,6 +347,8 @@ int main(int argc, char** argv) {
mi_collect(true);
mi_debug_show_arenas(true,true,false);
#endif
mi_collect(true);
mi_debug_show_arenas(true, true, false);
// mi_stats_print(NULL);
#else
mi_stats_print(NULL); // so we see rss/commit/elapsed