merge from dev

Daan Leijen 2024-12-31 14:20:00 -08:00
commit 7ebdfac18b
37 changed files with 262 additions and 2832 deletions


@@ -248,7 +248,7 @@ extern "C" {
// Forward Posix/Unix calls as well
void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize)
size_t malloc_size(const void* p) MI_FORWARD1(mi_usable_size,p)
-#if !defined(__ANDROID__) && !defined(__FreeBSD__)
+#if !defined(__ANDROID__) && !defined(__FreeBSD__) && !defined(__DragonFly__)
size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p)
#else
size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p)


@@ -290,7 +290,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
return p;
}
-// allocate in a speficic arena
+// allocate in a specific arena
static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment,
bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid )
{
@@ -1009,5 +1009,3 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv
if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
return err;
}


@@ -59,7 +59,7 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
MI_UNUSED(pq);
mi_assert_internal(mi_page_heap(page) == heap);
mi_segment_t* segment = _mi_page_segment(page);
-mi_assert_internal(segment->thread_id == heap->thread_id);
+mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == heap->thread_id);
mi_assert_expensive(_mi_page_is_valid(page));
return true;
}
@@ -405,9 +405,10 @@ void mi_heap_destroy(mi_heap_t* heap) {
}
// forcefully destroy all heaps in the current thread
-void _mi_heap_unsafe_destroy_all(void) {
-mi_heap_t* bheap = mi_heap_get_backing();
-mi_heap_t* curr = bheap->tld->heaps;
+void _mi_heap_unsafe_destroy_all(mi_heap_t* heap) {
+mi_assert_internal(heap != NULL);
+if (heap == NULL) return;
+mi_heap_t* curr = heap->tld->heaps;
while (curr != NULL) {
mi_heap_t* next = curr->next;
if (curr->no_reclaim) {


@@ -685,15 +685,20 @@ void mi_cdecl _mi_process_done(void) {
if (process_done) return;
process_done = true;
+// get the default heap so we don't need to acces thread locals anymore
+mi_heap_t* heap = mi_prim_get_default_heap(); // use prim to not initialize any heap
+mi_assert_internal(heap != NULL);
// release any thread specific resources and ensure _mi_thread_done is called on all but the main thread
_mi_prim_thread_done_auto_done();
#ifndef MI_SKIP_COLLECT_ON_EXIT
#if (MI_DEBUG || !defined(MI_SHARED_LIB))
// free all memory if possible on process exit. This is not needed for a stand-alone process
// but should be done if mimalloc is statically linked into another shared library which
// is repeatedly loaded/unloaded, see issue #281.
-mi_collect(true /* force */ );
+mi_heap_collect(heap, true /* force */ );
#endif
#endif
@@ -701,9 +706,10 @@ void mi_cdecl _mi_process_done(void) {
// since after process_done there might still be other code running that calls `free` (like at_exit routines,
// or C-runtime termination code.
if (mi_option_is_enabled(mi_option_destroy_on_exit)) {
-mi_collect(true /* force */);
-_mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!)
+mi_heap_collect(heap, true /* force */);
+_mi_heap_unsafe_destroy_all(heap); // forcefully release all memory held by all heaps (of this thread only!)
_mi_arena_unsafe_destroy_all();
+_mi_segment_map_unsafe_destroy();
}
if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
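(Aside, not part of the diff: the two hunks above route the exit-time collection through an explicit heap and, when the destroy_on_exit option is enabled, forcefully destroy all heaps of the exiting thread. A minimal sketch of how an embedding program might opt into that behavior through the public options API; enabling the option at startup like this is an assumed usage pattern, not something this commit prescribes.)

#include <mimalloc.h>

int main(void) {
  // Assumed usage: opt in early, before significant allocation. At process exit
  // mimalloc then runs the forceful collect/destroy path shown in the hunk above.
  mi_option_enable(mi_option_destroy_on_exit);

  void* p = mi_malloc(1024);
  mi_free(p);
  return 0;  // _mi_process_done() performs the exit-time cleanup
}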


@@ -147,7 +147,7 @@ static mi_option_desc_t options[_mi_option_last] =
{ MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's)
{ 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries.
#if defined(MI_VISIT_ABANDONED)
-{ 1, INITIALIZED, MI_OPTION(visit_abandoned) }, // allow visiting heap blocks in abandonded segments; requires taking locks during reclaim.
+{ 1, INITIALIZED, MI_OPTION(visit_abandoned) }, // allow visiting heap blocks in abandoned segments; requires taking locks during reclaim.
#else
{ 0, UNINIT, MI_OPTION(visit_abandoned) },
#endif
@@ -368,7 +368,7 @@ static _Atomic(size_t) warning_count; // = 0; // when >= max_warning_count stop
// (recursively) invoke malloc again to allocate space for the thread local
// variables on demand. This is why we use a _mi_preloading test on such
// platforms. However, C code generator may move the initial thread local address
-// load before the `if` and we therefore split it out in a separate funcion.
+// load before the `if` and we therefore split it out in a separate function.
static mi_decl_thread bool recurse = false;
static mi_decl_noinline bool mi_recurse_enter_prim(void) {


@@ -27,6 +27,7 @@ terms of the MIT license. A copy of the license can be found in the file
#include <sys/mman.h> // mmap
#include <unistd.h> // sysconf
#include <fcntl.h> // open, close, read, access
+#include <stdlib.h> // getenv, arc4random_buf
#if defined(__linux__)
#include <features.h>
@@ -247,7 +248,7 @@ static int unix_mmap_fd(void) {
#if defined(VM_MAKE_TAG)
// macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
int os_tag = (int)mi_option_get(mi_option_os_tag);
-if (os_tag < 100 || os_tag > 255) { os_tag = 100; }
+if (os_tag < 100 || os_tag > 255) { os_tag = 254; }
return VM_MAKE_TAG(os_tag);
#else
return -1;
@@ -766,7 +767,7 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
#include <CommonCrypto/CommonRandom.h>
bool _mi_prim_random_buf(void* buf, size_t buf_len) {
-// We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
+// We prefer CCRandomGenerateBytes as it returns an error code while arc4random_buf
// may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
}
@@ -776,7 +777,6 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
defined(__sun) || \
(defined(__APPLE__) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7))
-#include <stdlib.h>
bool _mi_prim_random_buf(void* buf, size_t buf_len) {
arc4random_buf(buf, buf_len);
return true;


@@ -50,7 +50,7 @@ typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*
static PVirtualAlloc2 pVirtualAlloc2 = NULL;
static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
-// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7
+// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7
typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
@@ -814,4 +814,4 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
void _mi_allocator_done(void) {
mi_allocator_done();
}
-#endif
+#endif


@@ -57,6 +57,7 @@ static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bo
mi_memid_t memid;
part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid);
if (part == NULL) return NULL;
+part->memid = memid;
mi_segmap_part_t* expected = NULL;
if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) {
_mi_os_free(part, sizeof(mi_segmap_part_t), memid);
@@ -124,3 +125,12 @@ static bool mi_is_valid_pointer(const void* p) {
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
return mi_is_valid_pointer(p);
}
+void _mi_segment_map_unsafe_destroy(void) {
+for (size_t i = 0; i < MI_SEGMENT_MAP_MAX_PARTS; i++) {
+mi_segmap_part_t* part = mi_atomic_exchange_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[i], NULL);
+if (part != NULL) {
+_mi_os_free(part, sizeof(mi_segmap_part_t), part->memid);
+}
+}
+}


@@ -150,6 +150,23 @@ size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) {
/* --------------------------------------------------------------------------------
Segment allocation
We allocate pages inside bigger "segments" (32 MiB on 64-bit). This is to avoid
splitting VMA's on Linux and reduce fragmentation on other OS's.
Each thread owns its own segments.
Currently we have:
- small pages (64KiB)
- medium pages (512KiB)
- large pages (4MiB),
- huge segments have 1 page in one segment that can be larger than `MI_SEGMENT_SIZE`.
it is used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or with alignment `> MI_BLOCK_ALIGNMENT_MAX`.
The memory for a segment is usually committed on demand.
(i.e. we are careful to not touch the memory until we actually allocate a block there)
If a thread ends, it "abandons" pages that still contain live blocks.
Such segments are abandoned and these can be reclaimed by still running threads,
(much like work-stealing).
-------------------------------------------------------------------------------- */
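(Aside, not part of the diff: the comment above maps page kinds to sizes. A small illustrative sketch using only the public API; which page kind a given block size actually lands in is governed by internal thresholds such as MI_LARGE_OBJ_SIZE_MAX, so the comments below are approximations, not guarantees taken from this commit.)

#include <mimalloc.h>

int main(void) {
  void* s = mi_malloc(64);                // small block: typically served from a 64 KiB small page
  void* m = mi_malloc(100 * 1024);        // typically a 512 KiB medium page
  void* l = mi_malloc(1024 * 1024);       // typically a 4 MiB large page
  void* h = mi_malloc(64 * 1024 * 1024);  // larger than a segment: a dedicated huge segment
  mi_free(s); mi_free(m); mi_free(l); mi_free(h);
  return 0;
}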
@@ -1068,7 +1085,7 @@ When a block is freed in an abandoned segment, the segment
is reclaimed into that thread.
Moreover, if threads are looking for a fresh segment, they
-will first consider abondoned segments -- these can be found
+will first consider abandoned segments -- these can be found
by scanning the arena memory
(segments outside arena memoryare only reclaimed by a free).
----------------------------------------------------------- */
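(Aside, not part of the diff: the comments above describe abandonment and reclaim. A minimal sketch of the scenario they describe, using the public API plus POSIX threads: a thread allocates a block and terminates, so the segment holding it is abandoned; a later free from a still-running thread is the case in which that segment can be reclaimed. The reclaim itself is internal; this only illustrates the usage pattern.)

#include <mimalloc.h>
#include <pthread.h>

// The worker allocates and exits without freeing; the segment holding the
// block still contains a live allocation, so it is abandoned at thread exit.
static void* worker(void* arg) {
  (void)arg;
  return mi_malloc(1024);
}

int main(void) {
  pthread_t t;
  void* p = NULL;
  pthread_create(&t, NULL, worker, NULL);
  pthread_join(&t, &p);
  // Freeing from this still-running thread is the "free in an abandoned
  // segment" case described above, after which the segment can be reclaimed.
  mi_free(p);
  return 0;
}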
@@ -1324,7 +1341,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
{
mi_assert(segment->subproc == heap->tld->segments.subproc); // cursor only visits segments in our sub-process
segment->abandoned_visits++;
-// todo: should we respect numa affinity for abondoned reclaim? perhaps only for the first visit?
+// todo: should we respect numa affinity for abandoned reclaim? perhaps only for the first visit?
// todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries
// Perhaps we can skip non-suitable ones in a better way?
bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid);