Merge branch 'dev-slice' into fix-passing-heap-v2

2025-09-16 11:14:47 +03:00 · 2024-03-31 23:23:17 +02:00 · 2024-03-31 23:23:17 +02:00 · 8e3d7add99
commit 8e3d7add99
parent 6ce79eb3d7 f199b888b4
19 changed files with 929 additions and 820 deletions
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@ -30,7 +30,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_decl_noinline        __declspec(noinline)
 #define mi_decl_thread          __declspec(thread)
 #define mi_decl_cache_align     __declspec(align(MI_CACHE_LINE))
-#define mi_decl_weak            
+#define mi_decl_weak
 #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
 #define mi_decl_noinline        __attribute__((noinline))
 #define mi_decl_thread          __thread
@ -40,7 +40,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_decl_noinline
 #define mi_decl_thread          __thread        // hope for the best :-)
 #define mi_decl_cache_align
-#define mi_decl_weak           
+#define mi_decl_weak
 #endif

 #if defined(__EMSCRIPTEN__) && !defined(__wasi__)
@ -133,8 +133,8 @@ void       _mi_arena_segment_mark_abandoned(mi_segment_t* segment);
 size_t     _mi_arena_segment_abandoned_count(void);

 typedef struct mi_arena_field_cursor_s { // abstract
-  mi_arena_id_t  start;   
-  int            count;   
+  mi_arena_id_t  start;
+  int            count;
  size_t         bitmap_idx;
 } mi_arena_field_cursor_t;
 void          _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current);
@ -203,9 +203,9 @@ void*       _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool
 void*       _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept;
 void*       _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept;     // called from `_mi_heap_malloc_aligned`
 void*       _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept;
-mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p);
+mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p);
 bool        _mi_free_delayed_block(mi_block_t* block);
-void        _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept;  // for runtime integration
+void        _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept;  // for runtime integration
 void        _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size);

 // "libc.c"
@ -437,9 +437,14 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si
 // Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE),
 // and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it;
 // therefore we align one byte before `p`.
+// We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`.
 static inline mi_segment_t* _mi_ptr_segment(const void* p) {
-  mi_assert_internal(p != NULL);
-  return (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK);
+  mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK);
+  #if MI_INTPTR_SIZE <= 4
+  return (p==NULL ? NULL : segment);
+  #else
+  return ((intptr_t)segment <= 0 ? NULL : segment);
+  #endif
 }

 static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) {
@ -454,6 +459,7 @@ static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) {

 // Segment belonging to a page
 static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
+  mi_assert_internal(page!=NULL);
  mi_segment_t* segment = _mi_ptr_segment(page);
  mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries));
  return segment;
@ -482,31 +488,28 @@ static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const
 }

 // Quick page start for initialized pages
-static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
-  return _mi_segment_page_start(segment, page, page_size);
+static inline uint8_t* mi_page_start(const mi_page_t* page) {
+  mi_assert_internal(page->page_start != NULL);
+  mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start);
+  return page->page_start;
 }

 // Get the page containing the pointer
 static inline mi_page_t* _mi_ptr_page(void* p) {
+  mi_assert_internal(p!=NULL);
  return _mi_segment_page_of(_mi_ptr_segment(p), p);
 }

 // Get the block size of a page (special case for huge objects)
 static inline size_t mi_page_block_size(const mi_page_t* page) {
-  const size_t bsize = page->xblock_size;
-  mi_assert_internal(bsize > 0);
-  if mi_likely(bsize < MI_HUGE_BLOCK_SIZE) {
-    return bsize;
-  }
-  else {
-    size_t psize;
-    _mi_segment_page_start(_mi_page_segment(page), page, &psize);
-    return psize;
-  }
+  mi_assert_internal(page->block_size > 0);
+  return page->block_size;
 }

 static inline bool mi_page_is_huge(const mi_page_t* page) {
-  return (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
+  mi_assert_internal((page->is_huge && _mi_page_segment(page)->kind == MI_SEGMENT_HUGE) ||
+                     (!page->is_huge && _mi_page_segment(page)->kind != MI_SEGMENT_HUGE));
+  return page->is_huge;
 }

 // Get the usable block size of a page without fixed padding.
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@ -13,9 +13,12 @@ terms of the MIT license. A copy of the license can be found in the file
 // mi_heap_t      : all data for a thread-local heap, contains
 //                  lists of all managed heap pages.
 // mi_segment_t   : a larger chunk of memory (32GiB) from where pages
-//                  are allocated.
-// mi_page_t      : a mimalloc page (usually 64KiB or 512KiB) from
+//                  are allocated. A segment is divided in slices (64KiB) from
+//                  which pages are allocated.
+// mi_page_t      : a "mimalloc" page (usually 64KiB or 512KiB) from
 //                  where objects are allocated.
+//                  Note: we write "OS page" for OS memory pages while
+//                  using plain "page" for mimalloc pages (`mi_page_t`).
 // --------------------------------------------------------------------------


@ -89,10 +92,11 @@ terms of the MIT license. A copy of the license can be found in the file
 #endif


-// We used to abandon huge pages but to eagerly deallocate if freed from another thread,
-// but that makes it not possible to visit them during a heap walk or include them in a
-// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks if freed from
-// another thread so most memory is available until it gets properly freed by the owning thread.
+// We used to abandon huge pages in order to eagerly deallocate it if freed from another thread.
+// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a
+// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from
+// another thread so the memory becomes "virtually" available (and eventually gets properly freed by
+// the owning thread).
 // #define MI_HUGE_PAGE_ABANDON 1


@ -192,17 +196,14 @@ typedef int32_t  mi_ssize_t;
 #error "mimalloc internal: define more bins"
 #endif

-// Maximum slice offset (15)
-#define MI_MAX_SLICE_OFFSET               ((MI_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)
-
-// Used as a special value to encode block sizes in 32 bits.
-#define MI_HUGE_BLOCK_SIZE                ((uint32_t)(2*MI_GiB))
-
 // blocks up to this size are always allocated aligned
 #define MI_MAX_ALIGN_GUARANTEE            (8*MI_MAX_ALIGN_SIZE)

-// Alignments over MI_ALIGNMENT_MAX are allocated in dedicated huge page segments
-#define MI_ALIGNMENT_MAX                  (MI_SEGMENT_SIZE >> 1)
+// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments
+#define MI_BLOCK_ALIGNMENT_MAX            (MI_SEGMENT_SIZE >> 1)
+
+// Maximum slice count (255) for which we can find the page for interior pointers
+#define MI_MAX_SLICE_OFFSET_COUNT         ((MI_BLOCK_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)


 // ------------------------------------------------------
@ -227,7 +228,7 @@ typedef enum mi_delayed_e {
  MI_USE_DELAYED_FREE   = 0, // push on the owning heap thread delayed list
  MI_DELAYED_FREEING    = 1, // temporary: another thread is accessing the owning heap
  MI_NO_DELAYED_FREE    = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
-  MI_NEVER_DELAYED_FREE = 3  // sticky, only resets on page reclaim
+  MI_NEVER_DELAYED_FREE = 3  // sticky: used for abondoned pages without a owning heap; this only resets on page reclaim
 } mi_delayed_t;


@ -266,7 +267,6 @@ typedef uintptr_t mi_thread_free_t;
 // implement a monotonic heartbeat. The `thread_free` list is needed for
 // avoiding atomic operations in the common case.
 //
-//
 // `used - |thread_free|` == actual blocks that are in use (alive)
 // `used - |thread_free| + |free| + |local_free| == capacity`
 //
@ -274,16 +274,13 @@ typedef uintptr_t mi_thread_free_t;
 // the number of memory accesses in the `mi_page_all_free` function(s).
 //
 // Notes:
-// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`)
+// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
 // - Using `uint16_t` does not seem to slow things down
-// - The size is 8 words on 64-bit which helps the page index calculations
-//   (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10
-//    and 12 are still good for address calculation)
-// - To limit the structure size, the `xblock_size` is 32-bits only; for
-//   blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size
-// - `thread_free` uses the bottom bits as a delayed-free flags to optimize
+// - The size is 12 words on 64-bit which helps the page index calculations
+//   (and 14 words on 32-bit, and encoded free lists add 2 words)
+// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize
 //   concurrent frees where only the first concurrent free adds to the owning
-//   heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`).
+//   heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`).
 //   The invariant is that no-delayed-free is only set if there is
 //   at least one block that will be added, or as already been added, to
 //   the owning heap `thread_delayed_free` list. This guarantees that pages
@ -292,20 +289,24 @@ typedef struct mi_page_s {
  // "owned" by the segment
  uint32_t              slice_count;       // slices in this page (0 if not a page)
  uint32_t              slice_offset;      // distance from the actual page data slice (0 if a page)
-  uint8_t               is_committed : 1;  // `true` if the page virtual memory is committed
-  uint8_t               is_zero_init : 1;  // `true` if the page was initially zero initialized
-
+  uint8_t               is_committed:1;    // `true` if the page virtual memory is committed
+  uint8_t               is_zero_init:1;    // `true` if the page was initially zero initialized
+  uint8_t               is_huge:1;         // `true` if the page is in a huge segment (`segment->kind == MI_SEGMENT_HUGE`)
+                                           // padding
  // layout like this to optimize access in `mi_malloc` and `mi_free`
  uint16_t              capacity;          // number of blocks committed, must be the first field, see `segment.c:page_clear`
  uint16_t              reserved;          // number of blocks reserved in memory
  mi_page_flags_t       flags;             // `in_full` and `has_aligned` flags (8 bits)
-  uint8_t               free_is_zero : 1;  // `true` if the blocks in the free list are zero initialized
-  uint8_t               retire_expire : 7; // expiration count for retired blocks
+  uint8_t               free_is_zero:1;    // `true` if the blocks in the free list are zero initialized
+  uint8_t               retire_expire:7;   // expiration count for retired blocks

  mi_block_t*           free;              // list of available free blocks (`malloc` allocates from this list)
-  uint32_t              used;              // number of blocks in use (including blocks in `thread_free`)
-  uint32_t              xblock_size;       // size available in each block (always `>0`)
  mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
+  uint16_t              used;              // number of blocks in use (including blocks in `thread_free`)
+  uint8_t               block_size_shift;  // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
+                                           // padding
+  size_t                block_size;        // size available in each block (always `>0`)
+  uint8_t*              page_start;        // start of the page area containing the blocks

  #if (MI_ENCODE_FREELIST || MI_PADDING)
  uintptr_t             keys[2];           // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
@ -317,10 +318,8 @@ typedef struct mi_page_s {
  struct mi_page_s*     next;              // next page owned by this thread with the same `block_size`
  struct mi_page_s*     prev;              // previous page owned by this thread with the same `block_size`

-  // 64-bit 9 words, 32-bit 12 words, (+2 for secure)
-  #if MI_INTPTR_SIZE==8
-  uintptr_t padding[1];
-  #endif
+  // 64-bit 11 words, 32-bit 13 words, (+2 for secure)
+  void* padding[1];
 } mi_page_t;


@ -331,14 +330,15 @@ typedef struct mi_page_s {

 typedef enum mi_page_kind_e {
  MI_PAGE_SMALL,    // small blocks go into 64KiB pages inside a segment
-  MI_PAGE_MEDIUM,   // medium blocks go into medium pages inside a segment
-  MI_PAGE_LARGE,    // larger blocks go into a page of just one block
-  MI_PAGE_HUGE,     // huge blocks (> 16 MiB) are put into a single page in a single segment.
+  MI_PAGE_MEDIUM,   // medium blocks go into 512KiB pages inside a segment
+  MI_PAGE_LARGE,    // larger blocks go into a single page spanning a whole segment
+  MI_PAGE_HUGE      // a huge page is a single page in a segment of variable size
+                    // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`.
 } mi_page_kind_t;

 typedef enum mi_segment_kind_e {
  MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside.
-  MI_SEGMENT_HUGE,   // > MI_LARGE_SIZE_MAX segment with just one huge page inside.
+  MI_SEGMENT_HUGE,   // segment with just one huge page inside.
 } mi_segment_kind_t;

 // ------------------------------------------------------
@ -371,13 +371,17 @@ typedef mi_page_t  mi_slice_t;
 typedef int64_t    mi_msecs_t;


+// ---------------------------------------------------------------
+// a memory id tracks the provenance of arena/OS allocated memory
+// ---------------------------------------------------------------
+
 // Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this.
 typedef enum mi_memkind_e {
  MI_MEM_NONE,      // not allocated
  MI_MEM_EXTERNAL,  // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
  MI_MEM_STATIC,    // allocated in a static area and should not be freed (for arena meta data for example)
  MI_MEM_OS,        // allocated from the OS
-  MI_MEM_OS_HUGE,   // allocated as huge os pages
+  MI_MEM_OS_HUGE,   // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
  MI_MEM_OS_REMAP,  // allocated in a remapable area (i.e. using `mremap`)
  MI_MEM_ARENA      // allocated from an arena (the usual case)
 } mi_memkind_t;
@ -394,7 +398,7 @@ typedef struct mi_memid_os_info {
 typedef struct mi_memid_arena_info {
  size_t        block_index;        // index in the arena
  mi_arena_id_t id;                 // arena id (>= 1)
-  bool          is_exclusive;       // the arena can only be used for specific arena allocations
+  bool          is_exclusive;       // this arena can only be used for specific arena allocations
 } mi_memid_arena_info_t;

 typedef struct mi_memid_s {
@ -402,46 +406,56 @@ typedef struct mi_memid_s {
    mi_memid_os_info_t    os;       // only used for MI_MEM_OS
    mi_memid_arena_info_t arena;    // only used for MI_MEM_ARENA
  } mem;
-  bool          is_pinned;          // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages)
+  bool          is_pinned;          // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages)
  bool          initially_committed;// `true` if the memory was originally allocated as committed
  bool          initially_zero;     // `true` if the memory was originally zero initialized
  mi_memkind_t  memkind;
 } mi_memid_t;


-// Segments are large allocated memory blocks (8mb on 64 bit) from
-// the OS. Inside segments we allocated fixed size _pages_ that
-// contain blocks.
+// -----------------------------------------------------------------------------------------
+// Segments are large allocated memory blocks (8mb on 64 bit) from arenas or the OS.
+//
+// Inside segments we allocated fixed size mimalloc pages (`mi_page_t`) that contain blocks.
+// The start of a segment is this structure with a fixed number of slice entries (`slices`)
+// usually followed by a guard OS page and the actual allocation area with pages.
+// While a page is not allocated, we view it's data as a `mi_slice_t` (instead of a `mi_page_t`).
+// Of any free area, the first slice has the info and `slice_offset == 0`; for any subsequent
+// slices part of the area, the `slice_offset` is the byte offset back to the first slice
+// (so we can quickly find the page info on a free, `internal.h:_mi_segment_page_of`).
+// For slices, the `block_size` field is repurposed to signify if a slice is used (`1`) or not (`0`).
+// Small and medium pages use a fixed amount of slices to reduce slice fragmentation, while
+// large and huge pages span a variable amount of slices.
 typedef struct mi_segment_s {
  // constant fields
-  mi_memid_t        memid;              // memory id for arena allocation
-  bool              allow_decommit;
-  bool              allow_purge;
+  mi_memid_t        memid;              // memory id for arena/OS allocation
+  bool              allow_decommit;     // can we decommmit the memory
+  bool              allow_purge;        // can we purge the memory (reset or decommit)
  size_t            segment_size;

  // segment fields
-  mi_msecs_t        purge_expire;
-  mi_commit_mask_t  purge_mask;
-  mi_commit_mask_t  commit_mask;
+  mi_msecs_t        purge_expire;       // purge slices in the `purge_mask` after this time
+  mi_commit_mask_t  purge_mask;         // slices that can be purged
+  mi_commit_mask_t  commit_mask;        // slices that are currently committed

  // from here is zero initialized
  struct mi_segment_s* next;            // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`)
  bool              was_reclaimed;      // true if it was reclaimed (used to limit on-free reclamation)

  size_t            abandoned;          // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
-  size_t            abandoned_visits;   // count how often this segment is visited in the abandoned list (to force reclaim it it is too long)
+  size_t            abandoned_visits;   // count how often this segment is visited during abondoned reclamation (to force reclaim if it takes too long)
  size_t            used;               // count of pages in use
  uintptr_t         cookie;             // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`

  size_t            segment_slices;      // for huge segments this may be different from `MI_SLICES_PER_SEGMENT`
-  size_t            segment_info_slices; // initial slices we are using segment info and possible guard pages.
+  size_t            segment_info_slices; // initial count of slices that we are using for segment info and possible guard pages.

  // layout like this to optimize access in `mi_free`
  mi_segment_kind_t kind;
  size_t            slice_entries;       // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT`
  _Atomic(mi_threadid_t) thread_id;      // unique id of the thread owning this segment

-  mi_slice_t        slices[MI_SLICES_PER_SEGMENT+1];  // one more for huge blocks with large alignment
+  mi_slice_t        slices[MI_SLICES_PER_SEGMENT+1];  // one extra final entry for huge blocks with large alignment
 } mi_segment_t;


@ -498,8 +512,6 @@ typedef struct mi_padding_s {
 // A heap owns a set of pages.
 struct mi_heap_s {
  mi_tld_t*             tld;
-  mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
-  mi_page_queue_t       pages[MI_BIN_FULL + 1];              // queue of pages for each size class (or "bin")
  _Atomic(mi_block_t*)  thread_delayed_free;
  mi_threadid_t         thread_id;                           // thread this heap belongs too
  mi_arena_id_t         arena_id;                            // arena id if the heap belongs to a specific arena (or 0)
@ -511,6 +523,8 @@ struct mi_heap_s {
  size_t                page_retired_max;                    // largest retired index into the `pages` array.
  mi_heap_t*            next;                                // list of heaps per thread
  bool                  no_reclaim;                          // `true` if this heap should not reclaim abandoned pages
+  mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
+  mi_page_queue_t       pages[MI_BIN_FULL + 1];              // queue of pages for each size class (or "bin")
 };


@ -626,6 +640,7 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
 #define mi_heap_stat_increase(heap,stat,amount)  mi_stat_increase( (heap)->tld->stats.stat, amount)
 #define mi_heap_stat_decrease(heap,stat,amount)  mi_stat_decrease( (heap)->tld->stats.stat, amount)

+
 // ------------------------------------------------------
 // Thread Local data
 // ------------------------------------------------------